jemalloc/src/psset.c
guangli-dai c067a55c79 Introducing a new usize calculation policy
jemalloc has historically converted size to usize by ceiling the size
up to the nearest size class.  However, with HPA enabled this wastes a
lot of memory.  This commit changes how usize is calculated so that the
gap between two contiguous usizes is no larger than a page.
Specifically, this commit includes the following changes:

1. Add a build-time config option (--enable-limit-usize-gap) and a
runtime one (limit_usize_gap) to guard the changes.
When the build-time config is enabled, some minor CPU overhead is
expected because usize will be stored and accessed separately from the
index.  When the runtime option is also enabled (it can only be enabled
with the build-time config enabled), a new usize calculation approach
will be employed (sketched below).  Instead of using the size classes,
the new calculation ceils size up to the closest multiple of PAGE for
all sizes larger than USIZE_GROW_SLOW_THRESHOLD.  Note that when the
build-time config is enabled, the runtime option defaults to on.
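
A minimal sketch of the new calculation; the helper name below is
illustrative (not the actual function added), and the real
sz_s2u_compute also keeps its existing handling of small sizes,
alignment and overflow:

    /* Sketch only: page-granular ceiling above the slow-growth threshold. */
    static inline size_t
    usize_from_size_sketch(size_t size) {
        if (opt_limit_usize_gap && size > USIZE_GROW_SLOW_THRESHOLD) {
            /* Ceil to the next multiple of PAGE instead of a size class. */
            return (size + PAGE - 1) & ~(PAGE - 1);
        }
        /* Small sizes keep the usual size-class ceiling. */
        return sz_s2u(size);
    }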

2. Prepare tcache for size classes that grow by PAGE above NGROUP * PAGE.
To prepare for the upcoming changes where size classes grow by PAGE once
larger than NGROUP * PAGE, disable the tcache for sizes larger than
2 * NGROUP * PAGE (see the sketch below).  The tcache threshold is set
higher than NGROUP * PAGE to avoid perf regression as much as possible,
since usizes between NGROUP * PAGE and 2 * NGROUP * PAGE already happen
to grow by PAGE.
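
The gating can be pictured as below; the helper name is illustrative,
and NGROUP is as referred to above (SC_NGROUP in the tree):

    /* Sketch only: keep caching through 2 * NGROUP * PAGE, then stop. */
    static inline bool
    tcache_usize_allowed_sketch(size_t usize) {
        /*
         * Usizes in (NGROUP * PAGE, 2 * NGROUP * PAGE] already grow by
         * PAGE with the default classes, so keep caching them; disable
         * the tcache only above twice that point.
         */
        return usize <= 2 * SC_NGROUP * PAGE;
    }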

3. Prepare pac and hpa psset for size classes that grow by PAGE above
NGROUP * PAGE.
For PAC, to avoid having too many bins, the arena bins keep their
original layout.  This means some extra searching is needed for a
page-level request that is not aligned with the original size classes:
we should also search the heap before the current index, since the
previous heap may hold allocations that satisfy the request.  The same
change applies to HPA's psset.
This search relies on enumerating the heap, because not all allocs in
the previous heap are guaranteed to satisfy the request.  To balance
memory and CPU overhead, we currently enumerate at most a fixed number
of nodes before concluding that none can satisfy the request during an
enumeration (see the example below).
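
For example, suppose two adjacent psset classes quantize to 12 and 14
pages.  A 13-page request ceils to the 14-page heap, but a slab filed
under the 12-page heap (longest free range of 12 or 13 pages) may still
hold a 13-page run, so that heap is probed first; since only some of
its slabs qualify, the probe walks at most PSSET_ENUMERATE_MAX_NUM
slabs before giving up (see psset_enumerate_search and psset_pick_alloc
in the source below).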

4. Add bytes counters to arena large stats.
With the upcoming usize changes, stats computed by multiplying the
number of live allocations by the bin size are no longer accurate.
Thus, add separate counters to record the bytes malloced and dalloced
(sketched below).
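
A minimal sketch of the bookkeeping, with illustrative field names (the
actual counters sit in the arena large stats):

    /* Sketch only: track large bytes directly instead of deriving them. */
    typedef struct large_bytes_stats_s {
        uint64_t malloced_bytes;    /* cumulative bytes allocated */
        uint64_t dalloced_bytes;    /* cumulative bytes deallocated */
    } large_bytes_stats_t;

    /* Bytes currently live = malloced_bytes - dalloced_bytes. */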

5. Change the structs used when freeing, to avoid using index2size for
large sizes.
  - Change the definition of emap_alloc_ctx_t.
  - Change the read of both fields from edata_t.
  - Change the assignment and usage of emap_alloc_ctx_t.
  - Change other callsites of index2size.
Note that the data structure changes, i.e. to emap_alloc_ctx_t, take
effect when the build-time config (--enable-limit-usize-gap) is
enabled, but the struct stores the same value as index2size(szind)
unless the runtime option (opt_limit_usize_gap) is also enabled (see
the sketch below).
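
A hedged sketch of the shape of the change; szind and slab are the
existing fields, while the usize field and the config macro name are
illustrative:

    typedef struct emap_alloc_ctx_s {
        szind_t szind;
        bool slab;
    #ifdef JEMALLOC_LIMIT_USIZE_GAP    /* hypothetical macro name */
        /*
         * Recorded at allocation time; equals sz_index2size(szind) unless
         * opt_limit_usize_gap is enabled, in which case it may be a
         * page-granular value below the class size.
         */
        size_t usize;
    #endif
    } emap_alloc_ctx_t;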

6. Adapt hpa to the usize changes.
Change the settings in sec to limit its usage for sizes larger than
USIZE_GROW_SLOW_THRESHOLD, and modify the corresponding tests (see the
sketch below).
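
Roughly, the sec stops caching the page-granular sizes; a hedged sketch
of the wiring, using the sec's existing max_alloc knob (whether the
commit uses exactly this knob is an assumption):

    /* Sketch only: keep page-granular sizes out of the sec cache. */
    sec_opts.max_alloc = USIZE_GROW_SLOW_THRESHOLD;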

7. Modify the usize calculation and corresponding tests.
Change sz_s2u_compute.  Note that sz_index2size is no longer always
safe, while sz_size2index still works as expected (illustrated below).
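
The sz_index2size caveat can be seen with a small example (4 KiB pages
and default size classes assumed; the numbers are for illustration
only):

    /*
     * With opt_limit_usize_gap on, several page-granular usizes map to
     * the same size class, so the index alone cannot recover the usize:
     *
     *   size = 36 * PAGE -> usize = 36 * PAGE, szind = 40-page class
     *   size = 38 * PAGE -> usize = 38 * PAGE, szind = 40-page class
     *
     * sz_size2index() is still well defined, but sz_index2size(szind)
     * would report 40 * PAGE for both, so usize must be read directly
     * where it matters.
     */
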
2025-03-06 15:08:13 -08:00

#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"
#include "jemalloc/internal/psset.h"
#include "jemalloc/internal/fb.h"
void
psset_init(psset_t *psset) {
for (unsigned i = 0; i < PSSET_NPSIZES; i++) {
hpdata_age_heap_new(&psset->pageslabs[i]);
}
fb_init(psset->pageslab_bitmap, PSSET_NPSIZES);
memset(&psset->stats, 0, sizeof(psset->stats));
hpdata_empty_list_init(&psset->empty);
for (int i = 0; i < PSSET_NPURGE_LISTS; i++) {
hpdata_purge_list_init(&psset->to_purge[i]);
}
fb_init(psset->purge_bitmap, PSSET_NPURGE_LISTS);
hpdata_hugify_list_init(&psset->to_hugify);
}
static void
psset_bin_stats_accum(psset_bin_stats_t *dst, psset_bin_stats_t *src) {
dst->npageslabs += src->npageslabs;
dst->nactive += src->nactive;
dst->ndirty += src->ndirty;
}
void
psset_stats_accum(psset_stats_t *dst, psset_stats_t *src) {
psset_bin_stats_accum(&dst->merged, &src->merged);
for (int huge = 0; huge < PSSET_NHUGE; huge++) {
psset_bin_stats_accum(&dst->slabs[huge], &src->slabs[huge]);
psset_bin_stats_accum(&dst->full_slabs[huge],
&src->full_slabs[huge]);
psset_bin_stats_accum(&dst->empty_slabs[huge],
&src->empty_slabs[huge]);
}
for (pszind_t i = 0; i < PSSET_NPSIZES; i++) {
psset_bin_stats_accum(&dst->nonfull_slabs[i][0],
&src->nonfull_slabs[i][0]);
psset_bin_stats_accum(&dst->nonfull_slabs[i][1],
&src->nonfull_slabs[i][1]);
}
}
/*
* The stats maintenance strategy is to remove a pageslab's contribution to the
* stats when we call psset_update_begin, and re-add it (to a potentially new
* bin) when we call psset_update_end.
*/
JEMALLOC_ALWAYS_INLINE void
psset_slab_stats_insert_remove(psset_stats_t *stats,
psset_bin_stats_t *binstats, hpdata_t *ps, bool insert) {
size_t mul = insert ? (size_t)1 : (size_t)-1;
size_t nactive = hpdata_nactive_get(ps);
size_t ndirty = hpdata_ndirty_get(ps);
stats->merged.npageslabs += mul * 1;
stats->merged.nactive += mul * nactive;
stats->merged.ndirty += mul * ndirty;
/*
* The stats above are necessary for the purging logic to work; everything
* below exists only to improve observability and hence is optional, so we
* don't update it when stats are disabled.
*/
if (!config_stats) {
return;
}
size_t huge_idx = (size_t)hpdata_huge_get(ps);
stats->slabs[huge_idx].npageslabs += mul * 1;
stats->slabs[huge_idx].nactive += mul * nactive;
stats->slabs[huge_idx].ndirty += mul * ndirty;
binstats[huge_idx].npageslabs += mul * 1;
binstats[huge_idx].nactive += mul * nactive;
binstats[huge_idx].ndirty += mul * ndirty;
if (config_debug) {
psset_bin_stats_t check_stats[PSSET_NHUGE] = {{0}};
for (int huge = 0; huge < PSSET_NHUGE; huge++) {
psset_bin_stats_accum(&check_stats[huge],
&stats->full_slabs[huge]);
psset_bin_stats_accum(&check_stats[huge],
&stats->empty_slabs[huge]);
for (pszind_t pind = 0; pind < PSSET_NPSIZES; pind++) {
psset_bin_stats_accum(&check_stats[huge],
&stats->nonfull_slabs[pind][huge]);
}
}
assert(stats->merged.npageslabs
== check_stats[0].npageslabs + check_stats[1].npageslabs);
assert(stats->merged.nactive
== check_stats[0].nactive + check_stats[1].nactive);
assert(stats->merged.ndirty
== check_stats[0].ndirty + check_stats[1].ndirty);
for (int huge = 0; huge < PSSET_NHUGE; huge++) {
assert(stats->slabs[huge].npageslabs
== check_stats[huge].npageslabs);
assert(stats->slabs[huge].nactive
== check_stats[huge].nactive);
assert(stats->slabs[huge].ndirty
== check_stats[huge].ndirty);
}
}
}
static void
psset_slab_stats_insert(psset_stats_t *stats, psset_bin_stats_t *binstats,
hpdata_t *ps) {
psset_slab_stats_insert_remove(stats, binstats, ps, true);
}
static void
psset_slab_stats_remove(psset_stats_t *stats, psset_bin_stats_t *binstats,
hpdata_t *ps) {
psset_slab_stats_insert_remove(stats, binstats, ps, false);
}
static pszind_t
psset_hpdata_heap_index(const hpdata_t *ps) {
assert(!hpdata_full(ps));
assert(!hpdata_empty(ps));
size_t longest_free_range = hpdata_longest_free_range_get(ps);
pszind_t pind = sz_psz2ind(sz_psz_quantize_floor(
longest_free_range << LG_PAGE));
assert(pind < PSSET_NPSIZES);
return pind;
}
static void
psset_hpdata_heap_remove(psset_t *psset, hpdata_t *ps) {
pszind_t pind = psset_hpdata_heap_index(ps);
hpdata_age_heap_remove(&psset->pageslabs[pind], ps);
if (hpdata_age_heap_empty(&psset->pageslabs[pind])) {
fb_unset(psset->pageslab_bitmap, PSSET_NPSIZES, (size_t)pind);
}
}
static void
psset_hpdata_heap_insert(psset_t *psset, hpdata_t *ps) {
pszind_t pind = psset_hpdata_heap_index(ps);
if (hpdata_age_heap_empty(&psset->pageslabs[pind])) {
fb_set(psset->pageslab_bitmap, PSSET_NPSIZES, (size_t)pind);
}
hpdata_age_heap_insert(&psset->pageslabs[pind], ps);
}
static void
psset_stats_insert(psset_t *psset, hpdata_t *ps) {
psset_stats_t *stats = &psset->stats;
if (hpdata_empty(ps)) {
psset_slab_stats_insert(stats, psset->stats.empty_slabs, ps);
} else if (hpdata_full(ps)) {
psset_slab_stats_insert(stats, psset->stats.full_slabs, ps);
} else {
pszind_t pind = psset_hpdata_heap_index(ps);
psset_slab_stats_insert(stats, psset->stats.nonfull_slabs[pind],
ps);
}
}
static void
psset_stats_remove(psset_t *psset, hpdata_t *ps) {
psset_stats_t *stats = &psset->stats;
if (hpdata_empty(ps)) {
psset_slab_stats_remove(stats, psset->stats.empty_slabs, ps);
} else if (hpdata_full(ps)) {
psset_slab_stats_remove(stats, psset->stats.full_slabs, ps);
} else {
pszind_t pind = psset_hpdata_heap_index(ps);
psset_slab_stats_remove(stats, psset->stats.nonfull_slabs[pind],
ps);
}
}
/*
* Put ps into some container so that it can be found during future allocation
* requests.
*/
static void
psset_alloc_container_insert(psset_t *psset, hpdata_t *ps) {
assert(!hpdata_in_psset_alloc_container_get(ps));
hpdata_in_psset_alloc_container_set(ps, true);
if (hpdata_empty(ps)) {
/*
* This prepend, paired with popping the head in psset_fit,
* means we implement LIFO ordering for the empty slabs set,
* which seems reasonable.
*/
hpdata_empty_list_prepend(&psset->empty, ps);
} else if (hpdata_full(ps)) {
/*
* We don't need to keep track of the full slabs; we're never
* going to return them from a psset_pick_alloc call.
*/
} else {
psset_hpdata_heap_insert(psset, ps);
}
}
/* Remove ps from those collections. */
static void
psset_alloc_container_remove(psset_t *psset, hpdata_t *ps) {
assert(hpdata_in_psset_alloc_container_get(ps));
hpdata_in_psset_alloc_container_set(ps, false);
if (hpdata_empty(ps)) {
hpdata_empty_list_remove(&psset->empty, ps);
} else if (hpdata_full(ps)) {
/* Same as above -- do nothing in this case. */
} else {
psset_hpdata_heap_remove(psset, ps);
}
}
static size_t
psset_purge_list_ind(hpdata_t *ps) {
size_t ndirty = hpdata_ndirty_get(ps);
/* Shouldn't have something with no dirty pages purgeable. */
assert(ndirty > 0);
/*
* Higher indices correspond to lists we'd like to purge earlier; make
* the two highest indices correspond to empty lists, which we attempt
* to purge before purging any non-empty list. This has two advantages:
* - Empty page slabs are the least likely to get reused (we'll only
* pick them for an allocation if we have no other choice).
* - Empty page slabs can purge every dirty page they contain in a
* single call, which is not usually the case.
*
* We purge hugeified empty slabs before nonhugeified ones, on the basis
* that they are fully dirty, while nonhugified slabs might not be, so
* we free up more pages more easily.
*/
if (hpdata_nactive_get(ps) == 0) {
if (hpdata_huge_get(ps)) {
return PSSET_NPURGE_LISTS - 1;
} else {
return PSSET_NPURGE_LISTS - 2;
}
}
pszind_t pind = sz_psz2ind(sz_psz_quantize_floor(ndirty << LG_PAGE));
/*
* For non-empty slabs, we may reuse them again. Prefer purging
* non-hugeified slabs before hugeified ones then, among pages of
* similar dirtiness. We still get some benefit from the hugification.
*/
return (size_t)pind * 2 + (hpdata_huge_get(ps) ? 0 : 1);
}
static void
psset_maybe_remove_purge_list(psset_t *psset, hpdata_t *ps) {
/*
* Remove the hpdata from its purge list (if it's in one). Even if it's
* going to stay in the same one, by appending it during
* psset_update_end, we move it to the end of its queue, so that we
* purge LRU within a given dirtiness bucket.
*/
if (hpdata_purge_allowed_get(ps)) {
size_t ind = psset_purge_list_ind(ps);
hpdata_purge_list_t *purge_list = &psset->to_purge[ind];
hpdata_purge_list_remove(purge_list, ps);
if (hpdata_purge_list_empty(purge_list)) {
fb_unset(psset->purge_bitmap, PSSET_NPURGE_LISTS, ind);
}
}
}
static void
psset_maybe_insert_purge_list(psset_t *psset, hpdata_t *ps) {
if (hpdata_purge_allowed_get(ps)) {
size_t ind = psset_purge_list_ind(ps);
hpdata_purge_list_t *purge_list = &psset->to_purge[ind];
if (hpdata_purge_list_empty(purge_list)) {
fb_set(psset->purge_bitmap, PSSET_NPURGE_LISTS, ind);
}
hpdata_purge_list_append(purge_list, ps);
}
}
void
psset_update_begin(psset_t *psset, hpdata_t *ps) {
hpdata_assert_consistent(ps);
assert(hpdata_in_psset_get(ps));
hpdata_updating_set(ps, true);
psset_stats_remove(psset, ps);
if (hpdata_in_psset_alloc_container_get(ps)) {
/*
* Some metadata updates can break alloc container invariants
* (e.g. the longest free range determines the hpdata_heap_t the
* pageslab lives in).
*/
assert(hpdata_alloc_allowed_get(ps));
psset_alloc_container_remove(psset, ps);
}
psset_maybe_remove_purge_list(psset, ps);
/*
* We don't update presence in the hugify list; we try to keep it FIFO,
* even in the presence of other metadata updates. We'll update
* presence at the end of the metadata update if necessary.
*/
}
void
psset_update_end(psset_t *psset, hpdata_t *ps) {
assert(hpdata_in_psset_get(ps));
hpdata_updating_set(ps, false);
psset_stats_insert(psset, ps);
/*
* The update begin should have removed ps from whatever alloc container
* it was in.
*/
assert(!hpdata_in_psset_alloc_container_get(ps));
if (hpdata_alloc_allowed_get(ps)) {
psset_alloc_container_insert(psset, ps);
}
psset_maybe_insert_purge_list(psset, ps);
if (hpdata_hugify_allowed_get(ps)
&& !hpdata_in_psset_hugify_container_get(ps)) {
hpdata_in_psset_hugify_container_set(ps, true);
hpdata_hugify_list_append(&psset->to_hugify, ps);
} else if (!hpdata_hugify_allowed_get(ps)
&& hpdata_in_psset_hugify_container_get(ps)) {
hpdata_in_psset_hugify_container_set(ps, false);
hpdata_hugify_list_remove(&psset->to_hugify, ps);
}
hpdata_assert_consistent(ps);
}
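/*
* Bounded search of the nonfull heap at pind for a pageslab whose longest
* free range can hold size bytes.  At most PSSET_ENUMERATE_MAX_NUM slabs are
* visited; returns NULL if none of them fits within that budget.
*/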
hpdata_t *
psset_enumerate_search(psset_t *psset, pszind_t pind, size_t size) {
if (hpdata_age_heap_empty(&psset->pageslabs[pind])) {
return NULL;
}
hpdata_t *ps = NULL;
hpdata_age_heap_enumerate_helper_t helper;
hpdata_age_heap_enumerate_prepare(&psset->pageslabs[pind], &helper,
PSSET_ENUMERATE_MAX_NUM, sizeof(helper.bfs_queue) / sizeof(void *));
while ((ps = hpdata_age_heap_enumerate_next(&psset->pageslabs[pind],
&helper))) {
/* longest_free_range is in pages; convert before comparing to bytes. */
if ((hpdata_longest_free_range_get(ps) << LG_PAGE) >= size) {
return ps;
}
}
return NULL;
}
hpdata_t *
psset_pick_alloc(psset_t *psset, size_t size) {
assert((size & PAGE_MASK) == 0);
assert(size <= HUGEPAGE);
pszind_t min_pind = sz_psz2ind(sz_psz_quantize_ceil(size));
hpdata_t *ps = NULL;
/* See comments in eset_first_fit for why we enumerate search below. */
pszind_t pind_prev = sz_psz2ind(sz_psz_quantize_floor(size));
if (sz_limit_usize_gap_enabled() && pind_prev < min_pind) {
ps = psset_enumerate_search(psset, pind_prev, size);
if (ps != NULL) {
return ps;
}
}
pszind_t pind = (pszind_t)fb_ffs(psset->pageslab_bitmap, PSSET_NPSIZES,
(size_t)min_pind);
if (pind == PSSET_NPSIZES) {
return hpdata_empty_list_first(&psset->empty);
}
ps = hpdata_age_heap_first(&psset->pageslabs[pind]);
if (ps == NULL) {
return NULL;
}
hpdata_assert_consistent(ps);
return ps;
}
hpdata_t *
psset_pick_purge(psset_t *psset) {
ssize_t ind_ssz = fb_fls(psset->purge_bitmap, PSSET_NPURGE_LISTS,
PSSET_NPURGE_LISTS - 1);
if (ind_ssz < 0) {
return NULL;
}
pszind_t ind = (pszind_t)ind_ssz;
assert(ind < PSSET_NPURGE_LISTS);
hpdata_t *ps = hpdata_purge_list_first(&psset->to_purge[ind]);
assert(ps != NULL);
return ps;
}
hpdata_t *
psset_pick_hugify(psset_t *psset) {
return hpdata_hugify_list_first(&psset->to_hugify);
}
void
psset_insert(psset_t *psset, hpdata_t *ps) {
hpdata_in_psset_set(ps, true);
psset_stats_insert(psset, ps);
if (hpdata_alloc_allowed_get(ps)) {
psset_alloc_container_insert(psset, ps);
}
psset_maybe_insert_purge_list(psset, ps);
if (hpdata_hugify_allowed_get(ps)) {
hpdata_in_psset_hugify_container_set(ps, true);
hpdata_hugify_list_append(&psset->to_hugify, ps);
}
}
void
psset_remove(psset_t *psset, hpdata_t *ps) {
hpdata_in_psset_set(ps, false);
psset_stats_remove(psset, ps);
if (hpdata_in_psset_alloc_container_get(ps)) {
psset_alloc_container_remove(psset, ps);
}
psset_maybe_remove_purge_list(psset, ps);
if (hpdata_in_psset_hugify_container_get(ps)) {
hpdata_in_psset_hugify_container_set(ps, false);
hpdata_hugify_list_remove(&psset->to_hugify, ps);
}
}