Keep hugepages in use as long as we can

Hugepages are really hard to get. Currently, we wait until we fill a
memory region up with data to at least `hpa_hugification_threshold`
and then wait for `hpa_hugify_delay_ms` before we hugify the pageslab.
For this reason it seems wasteful to treat hugified pageslabs in the
same way as non-hugified ones. Based on that observation, two ideas
come to mind.

We should try to prioritize placing allocations on hugified pageslabs to
get performance improvements from hugepage usage immediately. While
there may be a better (in terms of fragmentation) pageslab currently
available, empty space on a hugepage is just sitting there, waiting for
a better allocation to appear, which might never happen. This unused
memory on a hugepage is counted towards our usage anyway, so we had
better put it to good use.

The same reasoning applies to purging prioritization. If we purge a
hugepage (`madvise(..., MADV_DONTNEED)`) we'll need to start over again
to assemble it back: filling it up and waiting. Moreover, we might never
assemble a hugepage again, because the kernel doesn't have contiguous
2 MiB regions anymore. Instead, we should purge non-huge pageslabs as
long as we can, because they are much cheaper to purge and do not
provide any performance benefits.
This commit is contained in:
Dmitry Ilvokhin 2024-11-20 06:56:42 -08:00
parent 6092c980a6
commit b64d7815b7
3 changed files with 239 additions and 139 deletions

View file

@ -27,17 +27,6 @@
*/
#define PSSET_NHUGE 2
/*
* We keep two purge lists per page size class; one for hugified hpdatas (at
* index 2*pszind), and one for the non-hugified hpdatas (at index 2*pszind +
* 1). This lets us implement a preference for purging non-hugified hpdatas
* among similarly-dirty ones.
* We reserve the last two indices for empty slabs, in that case purging
* hugified ones (which are definitionally all waste) before non-hugified ones
* (i.e. reversing the order).
*/
#define PSSET_NPURGE_LISTS (2 * PSSET_NPSIZES)
typedef struct psset_bin_stats_s psset_bin_stats_t;
struct psset_bin_stats_s {
/* How many pageslabs are in this bin? */
@ -65,11 +54,7 @@ struct psset_stats_s {
/* Non-huge and huge slabs. */
psset_bin_stats_t slabs[PSSET_NHUGE];
/*
* The second index is huge stats; nonfull_slabs[pszind][0] contains
* stats for the non-huge slabs in bucket pszind, while
* nonfull_slabs[pszind][1] contains stats for the huge slabs.
*/
/* Non-full slabs, distinguished for non-huge and huge slabs. */
psset_bin_stats_t nonfull_slabs[PSSET_NPSIZES][PSSET_NHUGE];
/*
@ -88,9 +73,9 @@ struct psset_s {
* The pageslabs, quantized by the size class of the largest contiguous
* free run of pages in a pageslab.
*/
hpdata_age_heap_t pageslabs[PSSET_NPSIZES];
hpdata_age_heap_t pageslabs[PSSET_NHUGE][PSSET_NPSIZES];
/* Bitmap for which set bits correspond to non-empty heaps. */
fb_group_t pageslab_bitmap[FB_NGROUPS(PSSET_NPSIZES)];
fb_group_t pageslab_bitmap[PSSET_NHUGE][FB_NGROUPS(PSSET_NPSIZES)];
psset_stats_t stats;
/*
* Slabs with no active allocations, but which are allowed to serve new
@ -102,9 +87,9 @@ struct psset_s {
* to purge them (with later indices indicating slabs we want to purge
* more).
*/
hpdata_purge_list_t to_purge[PSSET_NPURGE_LISTS];
hpdata_purge_list_t to_purge[PSSET_NHUGE][PSSET_NPSIZES];
/* Bitmap for which set bits correspond to non-empty purge lists. */
fb_group_t purge_bitmap[FB_NGROUPS(PSSET_NPURGE_LISTS)];
fb_group_t purge_bitmap[PSSET_NHUGE][FB_NGROUPS(PSSET_NPSIZES)];
/* Slabs which are available to be hugified. */
hpdata_hugify_list_t to_hugify;
};

View file

@ -5,18 +5,32 @@
#include "jemalloc/internal/fb.h"
/*
 * Initializes one row of per-size-class pageslab heaps, leaving every
 * heap empty.
 */
static void
psset_init_pageslabs(hpdata_age_heap_t *pageslabs) {
	for (size_t ind = 0; ind < PSSET_NPSIZES; ind++) {
		hpdata_age_heap_new(&pageslabs[ind]);
	}
}
/*
 * Initializes one row of per-size-class purge lists, leaving every list
 * empty.
 */
static void
psset_init_to_purge(hpdata_purge_list_t *to_purge) {
	for (size_t ind = 0; ind < PSSET_NPSIZES; ind++) {
		hpdata_purge_list_init(&to_purge[ind]);
	}
}
/*
 * Initializes an empty psset: per-huge-status heaps, purge lists, and
 * their non-empty bitmaps, plus the empty-slab and to-hugify lists and
 * zeroed stats.
 *
 * NOTE(review): the diff extraction interleaved the pre-commit
 * single-dimension initialization lines with the post-commit ones; this
 * is the reconstructed post-commit version.
 */
void
psset_init(psset_t *psset) {
	for (int huge = 0; huge < PSSET_NHUGE; huge++) {
		psset_init_pageslabs(psset->pageslabs[huge]);
		fb_init(psset->pageslab_bitmap[huge], PSSET_NPSIZES);
	}
	memset(&psset->stats, 0, sizeof(psset->stats));
	hpdata_empty_list_init(&psset->empty);
	for (int huge = 0; huge < PSSET_NHUGE; huge++) {
		psset_init_to_purge(psset->to_purge[huge]);
		fb_init(psset->purge_bitmap[huge], PSSET_NPSIZES);
	}
	hpdata_hugify_list_init(&psset->to_hugify);
}
@ -45,6 +59,11 @@ psset_stats_accum(psset_stats_t *dst, psset_stats_t *src) {
}
}
/*
 * Index into the two-element (non-huge vs. huge) dimension of the psset
 * arrays: hpdata_huge_get() cast to size_t, so non-huge pageslabs map to
 * index 0 and hugified ones to index 1.
 */
static size_t
psset_hpdata_huge_index(const hpdata_t *ps) {
	return (size_t)hpdata_huge_get(ps);
}
/*
* The stats maintenance strategy is to remove a pageslab's contribution to the
* stats when we call psset_update_begin, and re-add it (to a potentially new
@ -70,7 +89,7 @@ psset_slab_stats_insert_remove(psset_stats_t *stats,
return;
}
size_t huge_idx = (size_t)hpdata_huge_get(ps);
size_t huge_idx = psset_hpdata_huge_index(ps);
stats->slabs[huge_idx].npageslabs += mul * 1;
stats->slabs[huge_idx].nactive += mul * nactive;
@ -136,20 +155,26 @@ psset_hpdata_heap_index(const hpdata_t *ps) {
/*
 * Removes ps from the heap matching its huge status and largest-free-run
 * size class, clearing the corresponding non-empty bit when the heap
 * drains. (Reconstructed post-commit version; the diff extraction had
 * interleaved the old single-dimension lookup lines.)
 */
static void
psset_hpdata_heap_remove(psset_t *psset, hpdata_t *ps) {
	size_t huge_idx = psset_hpdata_huge_index(ps);
	pszind_t pind = psset_hpdata_heap_index(ps);
	hpdata_age_heap_t *heap = &psset->pageslabs[huge_idx][pind];
	hpdata_age_heap_remove(heap, ps);
	if (hpdata_age_heap_empty(heap)) {
		fb_unset(psset->pageslab_bitmap[huge_idx], PSSET_NPSIZES,
		    (size_t)pind);
	}
}
/*
 * Inserts ps into the heap matching its huge status and largest-free-run
 * size class, setting the corresponding non-empty bit on the
 * empty -> non-empty transition. (Reconstructed post-commit version; the
 * diff extraction had interleaved the old single-dimension lookup lines.)
 */
static void
psset_hpdata_heap_insert(psset_t *psset, hpdata_t *ps) {
	size_t huge_idx = psset_hpdata_huge_index(ps);
	pszind_t pind = psset_hpdata_heap_index(ps);
	hpdata_age_heap_t *heap = &psset->pageslabs[huge_idx][pind];
	if (hpdata_age_heap_empty(heap)) {
		fb_set(psset->pageslab_bitmap[huge_idx], PSSET_NPSIZES,
		    (size_t)pind);
	}
	hpdata_age_heap_insert(heap, ps);
}
static void
@ -227,32 +252,18 @@ psset_purge_list_ind(hpdata_t *ps) {
assert(ndirty > 0);
/*
* Higher indices correspond to lists we'd like to purge earlier; make
* the two highest indices correspond to empty lists, which we attempt
* the highest index correspond to empty list, which we attempt
* to purge before purging any non-empty list. This has two advantages:
* - Empty page slabs are the least likely to get reused (we'll only
* pick them for an allocation if we have no other choice).
* - Empty page slabs can purge every dirty page they contain in a
* single call, which is not usually the case.
*
* We purge hugeified empty slabs before nonhugeified ones, on the basis
* that they are fully dirty, while nonhugified slabs might not be, so
* we free up more pages more easily.
*/
if (hpdata_nactive_get(ps) == 0) {
if (hpdata_huge_get(ps)) {
return PSSET_NPURGE_LISTS - 1;
} else {
return PSSET_NPURGE_LISTS - 2;
}
return PSSET_NPSIZES - 1;
}
pszind_t pind = sz_psz2ind(sz_psz_quantize_floor(ndirty << LG_PAGE));
/*
* For non-empty slabs, we may reuse them again. Prefer purging
* non-hugeified slabs before hugeified ones then, among pages of
* similar dirtiness. We still get some benefit from the hugification.
*/
return (size_t)pind * 2 + (hpdata_huge_get(ps) ? 0 : 1);
return sz_psz2ind(sz_psz_quantize_floor(ndirty << LG_PAGE));
}
static void
@ -264,11 +275,13 @@ psset_maybe_remove_purge_list(psset_t *psset, hpdata_t *ps) {
* purge LRU within a given dirtiness bucket.
*/
if (hpdata_purge_allowed_get(ps)) {
size_t huge = psset_hpdata_huge_index(ps);
size_t ind = psset_purge_list_ind(ps);
hpdata_purge_list_t *purge_list = &psset->to_purge[ind];
hpdata_purge_list_t *purge_list = &psset->to_purge[huge][ind];
hpdata_purge_list_remove(purge_list, ps);
if (hpdata_purge_list_empty(purge_list)) {
fb_unset(psset->purge_bitmap, PSSET_NPURGE_LISTS, ind);
fb_unset(psset->purge_bitmap[huge], PSSET_NPSIZES,
ind);
}
}
}
@ -276,10 +289,11 @@ psset_maybe_remove_purge_list(psset_t *psset, hpdata_t *ps) {
/*
 * If ps is allowed to be purged, links it onto the purge list matching
 * its huge status and dirtiness bucket, setting the corresponding
 * non-empty bit on the empty -> non-empty transition. Appending keeps
 * each list in purge-LRU order. (Reconstructed post-commit version; the
 * diff extraction had interleaved the old single-dimension lines.)
 */
static void
psset_maybe_insert_purge_list(psset_t *psset, hpdata_t *ps) {
	if (hpdata_purge_allowed_get(ps)) {
		size_t huge = psset_hpdata_huge_index(ps);
		size_t ind = psset_purge_list_ind(ps);
		hpdata_purge_list_t *purge_list = &psset->to_purge[huge][ind];
		if (hpdata_purge_list_empty(purge_list)) {
			fb_set(psset->purge_bitmap[huge], PSSET_NPSIZES, ind);
		}
		hpdata_purge_list_append(purge_list, ps);
	}
}
@ -343,33 +357,70 @@ psset_pick_alloc(psset_t *psset, size_t size) {
assert(size <= HUGEPAGE);
pszind_t min_pind = sz_psz2ind(sz_psz_quantize_ceil(size));
pszind_t pind = (pszind_t)fb_ffs(psset->pageslab_bitmap, PSSET_NPSIZES,
(size_t)min_pind);
if (pind == PSSET_NPSIZES) {
return hpdata_empty_list_first(&psset->empty);
}
hpdata_t *ps = hpdata_age_heap_first(&psset->pageslabs[pind]);
if (ps == NULL) {
return NULL;
/*
* Try to place allocation on already hugified page first if possible
* to better utilize them.
*/
for (int huge = PSSET_NHUGE - 1; huge >= 0; --huge) {
pszind_t pind = (pszind_t)fb_ffs(psset->pageslab_bitmap[huge],
PSSET_NPSIZES, (size_t)min_pind);
if (pind == PSSET_NPSIZES) {
continue;
}
hpdata_t *ps = hpdata_age_heap_first(
&psset->pageslabs[huge][pind]);
if (ps == NULL) {
continue;
}
hpdata_assert_consistent(ps);
return ps;
}
hpdata_assert_consistent(ps);
return ps;
/*
* Couldn't find non-full slab to place allocation on, use empty slab
* if we have one available as last resort.
*/
return hpdata_empty_list_first(&psset->empty);
}
/*
 * Picks the best pageslab to purge next, or returns NULL when no purge
 * list is populated. Empty slabs (tracked in the last size-class bucket)
 * are preferred over non-empty ones; within each group the huge
 * preference differs, as explained below. (Reconstructed post-commit
 * version; the diff extraction had interleaved the old
 * PSSET_NPURGE_LISTS-based lines.)
 */
hpdata_t *
psset_pick_purge(psset_t *psset) {
	/*
	 * We purge hugified empty slabs before non-hugified ones, on the
	 * basis that they are fully dirty, while non-hugified slabs might
	 * not be, so we free up more pages more easily. Another reason to
	 * prefer purging hugified slabs is to free contiguous physical
	 * memory ranges in case there are not enough of them due to
	 * fragmentation at the operating system level.
	 */
	for (ssize_t huge = PSSET_NHUGE - 1; huge >= 0; --huge) {
		if (!fb_get(psset->purge_bitmap[huge], PSSET_NPSIZES,
		    PSSET_NPSIZES - 1)) {
			continue;
		}
		hpdata_t *ps = hpdata_purge_list_first(
		    &psset->to_purge[huge][PSSET_NPSIZES - 1]);
		assert(ps != NULL);
		return ps;
	}
	/* For non-empty pageslabs, prioritize purging non-hugified ones. */
	for (ssize_t huge = 0; huge < PSSET_NHUGE; ++huge) {
		ssize_t ind_ssz = fb_fls(psset->purge_bitmap[huge],
		    PSSET_NPSIZES, PSSET_NPSIZES - 1);
		if (ind_ssz < 0) {
			continue;
		}
		pszind_t ind = (pszind_t)ind_ssz;
		assert(ind < PSSET_NPSIZES);
		hpdata_t *ps = hpdata_purge_list_first(
		    &psset->to_purge[huge][ind]);
		assert(ps != NULL);
		return ps;
	}
	return NULL;
}
hpdata_t *

View file

@ -705,86 +705,149 @@ TEST_BEGIN(test_insert_remove) {
}
TEST_END
TEST_BEGIN(test_purge_prefers_nonhuge) {
/*
* All else being equal, we should prefer purging non-huge pages over
* huge ones for non-empty extents.
*/
/* Nothing magic about this constant. */
enum {
NHP = 23,
};
hpdata_t *hpdata;
TEST_BEGIN(test_alloc_prefers_huge) {
psset_t psset;
psset_init(&psset);
hpdata_t hpdata_huge[NHP];
uintptr_t huge_begin = (uintptr_t)&hpdata_huge[0];
uintptr_t huge_end = (uintptr_t)&hpdata_huge[NHP];
hpdata_t hpdata_nonhuge[NHP];
uintptr_t nonhuge_begin = (uintptr_t)&hpdata_nonhuge[0];
uintptr_t nonhuge_end = (uintptr_t)&hpdata_nonhuge[NHP];
hpdata_t nonhuge;
hpdata_init(&nonhuge, /* addr */ NULL, /* age */ 0);
psset_insert(&psset, &nonhuge);
for (size_t i = 0; i < NHP; i++) {
hpdata_init(&hpdata_huge[i], (void *)((10 + i) * HUGEPAGE),
123 + i);
psset_insert(&psset, &hpdata_huge[i]);
hpdata_t huge;
hpdata_init(&huge, /* addr */ (void *) HUGEPAGE, /* age */ 1);
psset_insert(&psset, &huge);
psset_update_begin(&psset, &huge);
hpdata_hugify(&huge);
psset_update_end(&psset, &huge);
hpdata_init(&hpdata_nonhuge[i],
(void *)((10 + NHP + i) * HUGEPAGE),
456 + i);
psset_insert(&psset, &hpdata_nonhuge[i]);
void *huge_allocs[HUGEPAGE_PAGES];
/* All allocations should be placed on huge pageslab. */
for (size_t i = 0; i < HUGEPAGE_PAGES; i++) {
hpdata_t *next = psset_pick_alloc(&psset, PAGE);
expect_ptr_eq(hpdata_addr_get(next), hpdata_addr_get(&huge),
"Picked wrong pageslab to place allocation");
expect_u64_eq(hpdata_age_get(next), hpdata_age_get(&huge), "");
psset_update_begin(&psset, next);
huge_allocs[i] = hpdata_reserve_alloc(next, PAGE);
psset_update_end(&psset, next);
}
for (int i = 0; i < 2 * NHP; i++) {
hpdata = psset_pick_alloc(&psset, HUGEPAGE * 3 / 4);
psset_update_begin(&psset, hpdata);
void *ptr;
ptr = hpdata_reserve_alloc(hpdata, HUGEPAGE * 3 / 4);
/* Ignore the first alloc, which will stick around. */
(void)ptr;
/*
* The second alloc is to dirty the pages; free it immediately
* after allocating.
*/
ptr = hpdata_reserve_alloc(hpdata, HUGEPAGE / 4);
hpdata_unreserve(hpdata, ptr, HUGEPAGE / 4);
if (huge_begin <= (uintptr_t)hpdata
&& (uintptr_t)hpdata < huge_end) {
hpdata_hugify(hpdata);
}
void *nonhuge_allocs[HUGEPAGE_PAGES];
hpdata_purge_allowed_set(hpdata, true);
psset_update_end(&psset, hpdata);
/*
* Now, when huge pageslab is full, we should place allocations on
* non-huge one.
*/
for (size_t i = 0; i < HUGEPAGE_PAGES; i++) {
hpdata_t *next = psset_pick_alloc(&psset, PAGE);
expect_ptr_eq(hpdata_addr_get(next), hpdata_addr_get(&nonhuge),
"Picked wrong pageslab to place allocation");
expect_u64_eq(hpdata_age_get(next), hpdata_age_get(&nonhuge), "");
psset_update_begin(&psset, next);
nonhuge_allocs[i] = hpdata_reserve_alloc(next, PAGE);
psset_update_end(&psset, next);
}
/*
* We've got a bunch of 1/8th dirty hpdatas. It should give us all the
* non-huge ones to purge, then all the huge ones, then refuse to purge
* further.
* Deallocate everything except one page from huge pageslab, because
* empty pageslab is a completely different story.
*/
for (int i = 0; i < NHP; i++) {
hpdata = psset_pick_purge(&psset);
assert_true(nonhuge_begin <= (uintptr_t)hpdata
&& (uintptr_t)hpdata < nonhuge_end, "");
psset_update_begin(&psset, hpdata);
test_psset_fake_purge(hpdata);
hpdata_purge_allowed_set(hpdata, false);
psset_update_end(&psset, hpdata);
for (size_t i = 0; i < HUGEPAGE_PAGES - 1; i++) {
psset_update_begin(&psset, &huge);
hpdata_unreserve(&huge, huge_allocs[i], PAGE);
hpdata_purge_allowed_set(&huge, true);
psset_update_end(&psset, &huge);
}
for (int i = 0; i < NHP; i++) {
hpdata = psset_pick_purge(&psset);
expect_true(huge_begin <= (uintptr_t)hpdata
&& (uintptr_t)hpdata < huge_end, "");
psset_update_begin(&psset, hpdata);
hpdata_dehugify(hpdata);
test_psset_fake_purge(hpdata);
hpdata_purge_allowed_set(hpdata, false);
psset_update_end(&psset, hpdata);
/* And one page from nonhuge pageslab. */
psset_update_begin(&psset, &nonhuge);
hpdata_unreserve(&nonhuge, nonhuge_allocs[0], PAGE);
hpdata_purge_allowed_set(&nonhuge, true);
psset_update_end(&psset, &nonhuge);
/*
* Next allocation should be placed on huge pageslab, despite the fact
* that nonhuge pageslab is a better fit.
*/
hpdata_t *next = psset_pick_alloc(&psset, PAGE);
expect_ptr_eq(hpdata_addr_get(next), hpdata_addr_get(&huge),
"Picked wrong pageslab to place allocation");
expect_u64_eq(hpdata_age_get(next), hpdata_age_get(&huge), "");
}
TEST_END
/*
 * Test helper: reserves nallocs single pages from ps, then unreserves the
 * first ndallocs of them, marking the slab purge-allowed whenever any
 * page was freed (and therefore dirtied). The whole sequence happens
 * inside one psset update window.
 */
static void
test_do_alloc_dalloc(psset_t *psset, hpdata_t *ps, int nallocs, int ndallocs) {
	assert(nallocs >= ndallocs);
	VARIABLE_ARRAY(void *, pages, nallocs);
	psset_update_begin(psset, ps);
	for (int alloc_ind = 0; alloc_ind < nallocs; alloc_ind++) {
		pages[alloc_ind] = hpdata_reserve_alloc(ps, PAGE);
	}
	for (int dalloc_ind = 0; dalloc_ind < ndallocs; dalloc_ind++) {
		hpdata_unreserve(ps, pages[dalloc_ind], PAGE);
	}
	if (ndallocs > 0) {
		hpdata_purge_allowed_set(ps, true);
	}
	psset_update_end(psset, ps);
}
/*
 * With a non-huge and a huge pageslab at equal dirtiness, purging should
 * pick the non-huge slab first and only then the huge one.
 */
TEST_BEGIN(test_purge_prefers_nonhuge) {
	psset_t psset;
	psset_init(&psset);

	/* Two reserved pages, one of which gets freed (dirtied) per slab. */
	enum {
		NALLOCS = 2,
		NDALLOCS = NALLOCS - 1,
	};

	hpdata_t nonhuge;
	hpdata_init(&nonhuge, /* addr */ NULL, /* age */ 0);
	psset_insert(&psset, &nonhuge);
	/* Leave one active page so the slab is non-empty. */
	test_do_alloc_dalloc(&psset, &nonhuge, NALLOCS, NDALLOCS);

	hpdata_t huge;
	hpdata_init(&huge, /* addr */ (void *) HUGEPAGE, /* age */ 1);
	psset_insert(&psset, &huge);
	psset_update_begin(&psset, &huge);
	hpdata_hugify(&huge);
	psset_update_end(&psset, &huge);
	test_do_alloc_dalloc(&psset, &huge, NALLOCS, NDALLOCS);

	/*
	 * Now both pageslabs have the same amount of dirty pages; we should
	 * purge from nonhuge first and, when nothing is left there, purge
	 * from huge.
	 */
	hpdata_t* purge = psset_pick_purge(&psset);
	expect_ptr_eq(hpdata_addr_get(purge),
	    hpdata_addr_get(&nonhuge),
	    "Picked wrong pageslab to purge from");
	expect_u64_eq(hpdata_age_get(purge), hpdata_age_get(&nonhuge),
	    "");

	psset_update_begin(&psset, purge);
	test_psset_fake_purge(purge);
	hpdata_purge_allowed_set(purge, hpdata_ndirty_get(purge) > 0);
	psset_update_end(&psset, purge);

	purge = psset_pick_purge(&psset);
	expect_ptr_eq(hpdata_addr_get(purge),
	    hpdata_addr_get(&huge),
	    "Picked wrong pageslab to purge from");
	expect_u64_eq(hpdata_age_get(purge), hpdata_age_get(&huge),
	    "");
}
TEST_END
@ -907,6 +970,7 @@ main(void) {
test_stats_fullness,
test_oldest_fit,
test_insert_remove,
test_alloc_prefers_huge,
test_purge_prefers_nonhuge,
test_purge_prefers_empty,
test_purge_prefers_empty_huge);