[process_madvise] Use process_madvise across multiple huge_pages

This commit is contained in:
Slobodan Predolac 2025-04-05 12:14:14 -07:00 committed by Qi Wang
parent 0dfb4a5a1a
commit 1956a54a43
7 changed files with 482 additions and 78 deletions

View file

@ -232,6 +232,7 @@ TESTS_UNIT := \
$(srcroot)test/unit/hook.c \
$(srcroot)test/unit/hpa.c \
$(srcroot)test/unit/hpa_vectorized_madvise.c \
$(srcroot)test/unit/hpa_vectorized_madvise_large_batch.c \
$(srcroot)test/unit/hpa_background_thread.c \
$(srcroot)test/unit/hpdata.c \
$(srcroot)test/unit/huge.c \

View file

@ -79,4 +79,38 @@ hpa_range_accum_finish(hpa_range_accum_t *ra, hpa_shard_t *shard) {
}
}
/*
 * One huge page staged for purging.  For purging more than one page we
 * use a batch of these items: each is filled in under the shard lock and
 * consumed (madvised) with the lock dropped.
 */
typedef struct {
	/* Iteration state over the page's dirty ranges. */
	hpdata_purge_state_t state;
	/* The huge page being purged. */
	hpdata_t *hp;
	/* Whether the page must be dehugified before its ranges are purged. */
	bool dehugify;
} hpa_purge_item_t;
typedef struct hpa_purge_batch_s hpa_purge_batch_t;
/*
 * Accumulates up to items_capacity huge pages whose dirty ranges are
 * then purged together in one vectorized-madvise pass.
 */
struct hpa_purge_batch_s {
	/* Backing storage for the staged items; owned by the caller. */
	hpa_purge_item_t *items;
	size_t items_capacity;
	/* Number of huge pages to purge in current batch. */
	size_t item_cnt;
	/* Number of ranges to purge in current batch. */
	size_t nranges;
	/* Total number of dirty pages in current batch. */
	size_t ndirty_in_batch;
	/* Max number of huge pages to purge, across all batches. */
	size_t max_hp;
	/*
	 * Once we are above this watermark we should not add more pages
	 * to the same batch.  This is because while we want to minimize
	 * the number of madvise calls, we also do not want to prevent
	 * allocations from too many huge pages (which we have to do
	 * while they are being purged).
	 */
	size_t range_watermark;
	/* Running total of huge pages purged so far. */
	size_t npurged_hp_total;
};
#endif /* JEMALLOC_INTERNAL_HPA_UTILS_H */

View file

@ -12,6 +12,7 @@
/* Data. */
size_t opt_lg_extent_max_active_fit = LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT;
/* This option is intended for kernel tuning, not app tuning. */
size_t opt_process_madvise_max_batch =
#ifdef JEMALLOC_HAVE_PROCESS_MADVISE
PROCESS_MADVISE_MAX_BATCH_DEFAULT;

238
src/hpa.c
View file

@ -423,6 +423,31 @@ hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
return to_hugify != NULL || hpa_should_purge(tsdn, shard);
}
/*
* This is used for jemalloc internal tuning and may change in the
* future based on production traffic.
*
* This value protects two things:
* 1. Stack size
* 2. Number of huge pages that are being purged in a batch as
* we do not allow allocations while making *madvise
* syscall.
*/
#define HPA_PURGE_BATCH_MAX_DEFAULT 16
#ifndef JEMALLOC_JET
#define HPA_PURGE_BATCH_MAX HPA_PURGE_BATCH_MAX_DEFAULT
#else
size_t hpa_purge_max_batch_size_for_test = HPA_PURGE_BATCH_MAX_DEFAULT;
/*
 * Test-only knob: install a new max per-pass batch size and hand back
 * the previous value so tests can restore it when done.
 */
size_t
hpa_purge_max_batch_size_for_test_set(size_t new_size) {
	size_t prev = hpa_purge_max_batch_size_for_test;
	hpa_purge_max_batch_size_for_test = new_size;
	return prev;
}
#define HPA_PURGE_BATCH_MAX hpa_purge_max_batch_size_for_test
#endif
static inline size_t
hpa_process_madvise_max_iovec_len(void) {
assert(opt_process_madvise_max_batch <=
@ -431,14 +456,48 @@ hpa_process_madvise_max_iovec_len(void) {
HPA_MIN_VAR_VEC_SIZE : opt_process_madvise_max_batch;
}
/* Returns whether or not we purged anything. */
static bool
hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
malloc_mutex_assert_owner(tsdn, &shard->mtx);
/*
 * Purge every staged item in `batch`, folding their dirty ranges into
 * vectorized madvise calls via the range accumulator.  Must be called
 * WITHOUT the shard lock held: dehugify and the purge syscalls may block.
 */
static inline void
hpa_purge_actual_unlocked(hpa_shard_t *shard, hpa_purge_item_t *batch,
    size_t batch_sz) {
	assert(batch_sz > 0);
	/*
	 * NOTE(review): this psset_pick_purge result is immediately shadowed
	 * by the loop-local to_purge below and never used, and the psset is
	 * otherwise only touched under the shard lock.  Looks like a leftover
	 * from the pre-batching code -- confirm and remove.
	 */
	hpdata_t *to_purge = psset_pick_purge(&shard->psset);
	size_t len = hpa_process_madvise_max_iovec_len();
	VARIABLE_ARRAY(hpa_io_vector_t, vec, len);
	hpa_range_accum_t accum;
	hpa_range_accum_init(&accum, vec, len);
	for (size_t i = 0; i < batch_sz; ++i) {
		hpdata_t *to_purge = batch[i].hp;
		/* Actually do the purging, now that the lock is dropped. */
		if (batch[i].dehugify) {
			shard->central->hooks.dehugify(hpdata_addr_get(to_purge),
			    HUGEPAGE);
		}
		void *purge_addr;
		size_t purge_size;
		size_t total_purged_on_one_hp = 0;
		while (hpdata_purge_next(
		    to_purge, &batch[i].state, &purge_addr, &purge_size)) {
			total_purged_on_one_hp += purge_size;
			/* A single huge page can yield at most HUGEPAGE bytes. */
			assert(total_purged_on_one_hp <= HUGEPAGE);
			hpa_range_accum_add(&accum, purge_addr, purge_size, shard);
		}
	}
	/* Flush any ranges still buffered in the accumulator. */
	hpa_range_accum_finish(&accum, shard);
}
/*
 * Prepare the purge of one huge page.  Returns the number of dirty
 * regular pages on it, or 0 if no purgeable huge page is found.
 *
 * If a page was picked to purge, its purge state is initialized.
 */
static inline size_t
hpa_purge_start_hp(hpa_purge_batch_t *b, psset_t *psset) {
hpdata_t *to_purge = psset_pick_purge(psset);
if (to_purge == NULL) {
return false;
return 0;
}
assert(hpdata_purge_allowed_get(to_purge));
assert(!hpdata_changing_state_get(to_purge));
@ -448,7 +507,7 @@ hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
* we're purging it (allocations and deallocations are
* OK).
*/
psset_update_begin(&shard->psset, to_purge);
psset_update_begin(psset, to_purge);
assert(hpdata_alloc_allowed_get(to_purge));
hpdata_mid_purge_set(to_purge, true);
hpdata_purge_allowed_set(to_purge, false);
@ -461,70 +520,115 @@ hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
* (clearing out user data).
*/
hpdata_alloc_allowed_set(to_purge, false);
psset_update_end(&shard->psset, to_purge);
psset_update_end(psset, to_purge);
assert(b->item_cnt < b->items_capacity);
hpa_purge_item_t *hp_item = &b->items[b->item_cnt];
b->item_cnt++;
hp_item->hp = to_purge;
/* Gather all the metadata we'll need during the purge. */
bool dehugify = hpdata_huge_get(to_purge);
hp_item->dehugify = hpdata_huge_get(hp_item->hp);
size_t nranges;
hpdata_purge_state_t purge_state;
size_t num_to_purge = hpdata_purge_begin(to_purge, &purge_state, &nranges);
(void) nranges; /*not used yet */
size_t ndirty =
hpdata_purge_begin(hp_item->hp, &hp_item->state, &nranges);
/* We picked hp to purge, so it should have some dirty ranges */
assert(ndirty > 0 && nranges >0);
b->ndirty_in_batch += ndirty;
b->nranges += nranges;
return ndirty;
}
shard->npending_purge += num_to_purge;
malloc_mutex_unlock(tsdn, &shard->mtx);
/* Actually do the purging, now that the lock is dropped. */
if (dehugify) {
shard->central->hooks.dehugify(hpdata_addr_get(to_purge),
HUGEPAGE);
}
size_t total_purged = 0;
uint64_t purges_this_pass = 0;
size_t len = hpa_process_madvise_max_iovec_len();
VARIABLE_ARRAY(hpa_io_vector_t, vec, len);
hpa_range_accum_t accum;
hpa_range_accum_init(&accum, vec, len);
void *purge_addr;
size_t purge_size;
while (hpdata_purge_next(to_purge, &purge_state, &purge_addr,
&purge_size)) {
total_purged += purge_size;
assert(total_purged <= HUGEPAGE);
hpa_range_accum_add(&accum, purge_addr, purge_size, shard);
purges_this_pass++;
}
/* If batch was not full, finish */
hpa_range_accum_finish(&accum, shard);
malloc_mutex_lock(tsdn, &shard->mtx);
/* The shard updates */
shard->npending_purge -= num_to_purge;
shard->stats.npurge_passes++;
shard->stats.npurges += purges_this_pass;
shard->central->hooks.curtime(&shard->last_purge,
/* first_reading */ false);
if (dehugify) {
/* Finish purge of one huge page. */
static inline void
hpa_purge_finish_hp(tsdn_t *tsdn, hpa_shard_t *shard,
hpa_purge_item_t *hp_item) {
if (hp_item->dehugify) {
shard->stats.ndehugifies++;
}
/* The hpdata updates. */
psset_update_begin(&shard->psset, to_purge);
if (dehugify) {
hpdata_dehugify(to_purge);
psset_update_begin(&shard->psset, hp_item->hp);
if (hp_item->dehugify) {
hpdata_dehugify(hp_item->hp);
}
hpdata_purge_end(to_purge, &purge_state);
hpdata_mid_purge_set(to_purge, false);
hpdata_purge_end(hp_item->hp, &hp_item->state);
hpdata_mid_purge_set(hp_item->hp, false);
hpdata_alloc_allowed_set(to_purge, true);
hpa_update_purge_hugify_eligibility(tsdn, shard, to_purge);
hpdata_alloc_allowed_set(hp_item->hp, true);
hpa_update_purge_hugify_eligibility(tsdn, shard, hp_item->hp);
psset_update_end(&shard->psset, to_purge);
psset_update_end(&shard->psset, hp_item->hp);
}
return true;
/*
 * Whether the current batch can accept no more huge pages: we hit the
 * overall purge cap, ran out of item slots, or crossed the range
 * watermark.  It's okay for ranges to go above the watermark by the
 * last staged page's worth -- it is a soft limit.
 */
static inline bool
hpa_batch_full(hpa_purge_batch_t *b) {
	/*
	 * Use >= rather than == for the hard caps so that any overshoot
	 * (now or after future refactoring) still reports the batch as
	 * full instead of letting the fill loop run past its bounds.
	 */
	return b->npurged_hp_total >= b->max_hp ||
	    b->item_cnt >= b->items_capacity ||
	    b->nranges >= b->range_watermark;
}
/* Reset all per-pass counters before (re)filling the batch. */
static inline void
hpa_batch_pass_start(hpa_purge_batch_t *b) {
	b->ndirty_in_batch = 0;
	b->nranges = 0;
	b->item_cnt = 0;
}
/* A batch with no staged items is empty. */
static inline bool
hpa_batch_empty(hpa_purge_batch_t *batch) {
	return (batch->item_cnt == 0);
}
/*
 * Purge dirty pages from the shard, staging up to HPA_PURGE_BATCH_MAX
 * huge pages per pass so their ranges share vectorized madvise calls.
 * Returns the number of huge pages purged (may be fewer than max_hp if
 * the shard runs out of purgeable pages).
 *
 * Called and returns with shard->mtx held; the lock is dropped only
 * around the actual (possibly slow) madvise work.
 */
static inline size_t
hpa_purge(tsdn_t *tsdn, hpa_shard_t *shard, size_t max_hp) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	assert(max_hp > 0);
	assert(HPA_PURGE_BATCH_MAX > 0);
	/* Batch items live on the stack; bound the VLA size. */
	assert(HPA_PURGE_BATCH_MAX <
	    (VARIABLE_ARRAY_SIZE_MAX / sizeof(hpa_purge_item_t)));
	VARIABLE_ARRAY(hpa_purge_item_t, items, HPA_PURGE_BATCH_MAX);
	hpa_purge_batch_t batch = {
		.max_hp = max_hp,
		.npurged_hp_total = 0,
		.items = &items[0],
		.items_capacity = HPA_PURGE_BATCH_MAX,
		.range_watermark = hpa_process_madvise_max_iovec_len(),
	};
	assert(batch.range_watermark > 0);
	while (1) {
		hpa_batch_pass_start(&batch);
		assert(hpa_batch_empty(&batch));
		/* Stage huge pages under the lock until the batch fills. */
		while(!hpa_batch_full(&batch) && hpa_should_purge(tsdn, shard)) {
			size_t ndirty = hpa_purge_start_hp(&batch, &shard->psset);
			if (ndirty == 0) {
				/* Nothing purgeable left in the psset. */
				break;
			}
			shard->npending_purge += ndirty;
			batch.npurged_hp_total++;
		}
		if (hpa_batch_empty(&batch)) {
			break;
		}
		/* Drop the lock for the madvise syscalls themselves. */
		malloc_mutex_unlock(tsdn, &shard->mtx);
		hpa_purge_actual_unlocked(shard, batch.items, batch.item_cnt);
		malloc_mutex_lock(tsdn, &shard->mtx);
		/* The shard updates */
		shard->npending_purge -= batch.ndirty_in_batch;
		/*
		 * NOTE(review): npurges is bumped by the dirty-page count
		 * here; the pre-batching code counted madvise ranges
		 * instead.  Confirm which unit this stat should report.
		 */
		shard->stats.npurges += batch.ndirty_in_batch;
		shard->central->hooks.curtime(&shard->last_purge,
		    /* first_reading */ false);
		for (size_t i=0; i<batch.item_cnt; ++i) {
			hpa_purge_finish_hp(tsdn, shard, &batch.items[i]);
		}
	}
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	/* One logical purge pass per call, however many batches it took. */
	shard->stats.npurge_passes++;
	return batch.npurged_hp_total;
}
/* Returns whether or not we hugified anything. */
@ -654,19 +758,9 @@ hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard,
max_purges = max_purge_nhp;
}
while (hpa_should_purge(tsdn, shard) && nops < max_purges) {
if (!hpa_try_purge(tsdn, shard)) {
/*
* It is fine if we couldn't purge as sometimes
* we try to purge just to unblock
* hugification, but there is maybe no dirty
* pages at all at the moment.
*/
break;
}
malloc_mutex_assert_owner(tsdn, &shard->mtx);
nops++;
}
nops += hpa_purge(tsdn, shard, max_purges);
malloc_mutex_assert_owner(tsdn, &shard->mtx);
}
/*

View file

@ -237,15 +237,86 @@ TEST_BEGIN(test_more_regions_purged_from_one_page) {
expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early");
expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
/* We purge from 2 huge pages, each one 3 segments. That's 6 non
* vectorized calls, or 2 <= vc <=6 vectorized calls
* (depending on batch size).
/* We purge from 2 huge pages, each one 3 dirty continous segments.
* For opt_process_madvise_max_batch = 2, that is
* 2 calls for first page, and 2 calls for second as we don't
* want to hold the lock on the second page while vectorized batch
* of size 2 is already filled with the first one.
*/
size_t nexpected = 2 * (1 + (3 - 1) / opt_process_madvise_max_batch);
expect_zu_eq(4, ndefer_vec_purge_calls, "Expect purge");
expect_zu_eq(0, ndefer_purge_calls, "Expect no non-vec purge");
ndefer_vec_purge_calls = 0;
destroy_test_data(shard);
}
TEST_END
/*
 * Defined in src/hpa.c under JEMALLOC_JET; lets tests shrink the
 * per-pass huge-page batch size.  NOTE(review): consider moving this
 * prototype into a test header instead of re-declaring it here.
 */
size_t
hpa_purge_max_batch_size_for_test_set(size_t new_size);
TEST_BEGIN(test_more_pages_than_batch_page_size) {
	test_skip_if(!hpa_supported() ||
	    (opt_process_madvise_max_batch == 0) ||
	    HUGEPAGE_PAGES <= 4);
	/* Shrink the batch to one huge page; restored before returning. */
	size_t old_page_batch = hpa_purge_max_batch_size_for_test_set(1);
	hpa_hooks_t hooks;
	hooks.map = &defer_test_map;
	hooks.unmap = &defer_test_unmap;
	hooks.purge = &defer_test_purge;
	hooks.hugify = &defer_test_hugify;
	hooks.dehugify = &defer_test_dehugify;
	hooks.curtime = &defer_test_curtime;
	hooks.ms_since = &defer_test_ms_since;
	hooks.vectorized_purge = &defer_vectorized_purge;
	hpa_shard_opts_t opts = test_hpa_shard_opts_default;
	opts.deferral_allowed = true;
	/* Purge on every deferred-work pass, no interval gating. */
	opts.min_purge_interval_ms = 0;
	ndefer_vec_purge_calls = 0;
	ndefer_purge_calls = 0;
	hpa_shard_t *shard = create_test_data(&hooks, &opts);
	bool deferred_work_generated = false;
	nstime_init(&defer_curtime, 0);
	tsdn_t *tsdn = tsd_tsdn(tsd_fetch());
	/* Fill 8 huge pages with PAGE-sized allocations. */
	enum {NALLOCS = 8 * HUGEPAGE_PAGES};
	edata_t *edatas[NALLOCS];
	for (int i = 0; i < NALLOCS; i++) {
		edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false,
		    false, false, &deferred_work_generated);
		expect_ptr_not_null(edatas[i], "Unexpected null edata");
	}
	/* Free the first 3 huge pages' worth, dirtying them completely. */
	for (int i = 0; i < 3 * (int)HUGEPAGE_PAGES; i++) {
		pai_dalloc(tsdn, &shard->pai, edatas[i],
		    &deferred_work_generated);
	}
	hpa_shard_do_deferred_work(tsdn, shard);
	/*
	 * Strict minimum purge interval is not set, we should purge as long as
	 * we have dirty pages.
	 */
	expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early");
	expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
	/*
	 * We have page batch size = 1.
	 * We have 5 * HP active pages, 3 * HP dirty pages.
	 * To achieve the balance of 25% max dirty we need to
	 * purge 2 pages.  Since the batch is 1 that must be 2 calls
	 * no matter what opt_process_madvise_max_batch is.
	 */
	size_t nexpected = 2;
	expect_zu_eq(nexpected, ndefer_vec_purge_calls, "Expect purge");
	expect_zu_eq(0, ndefer_purge_calls, "Expect no non-vec purge");
	ndefer_vec_purge_calls = 0;
	hpa_purge_max_batch_size_for_test_set(old_page_batch);
	destroy_test_data(shard);
}
TEST_END
@ -254,5 +325,6 @@ int
main(void) {
return test_no_reentrancy(
test_vectorized_failure_fallback,
test_more_regions_purged_from_one_page);
test_more_regions_purged_from_one_page,
test_more_pages_than_batch_page_size);
}

View file

@ -0,0 +1,199 @@
#include "test/jemalloc_test.h"
#include "jemalloc/internal/hpa.h"
#include "jemalloc/internal/nstime.h"
#define SHARD_IND 111
#define ALLOC_MAX (HUGEPAGE)
typedef struct test_data_s test_data_t;
/*
 * Everything one test shard needs, allocated as a unit by
 * create_test_data and freed by destroy_test_data.
 */
struct test_data_s {
	/*
	 * Must be the first member -- we convert back and forth between the
	 * test_data_t and the hpa_shard_t;
	 */
	hpa_shard_t shard;
	hpa_central_t central;
	/* Metadata allocator created in create_test_data; deleted on destroy. */
	base_t *base;
	edata_cache_t shard_edata_cache;
	emap_t emap;
};
/*
 * Baseline shard options for these tests; each test copies this and
 * tweaks what it needs (deferral_allowed, min_purge_interval_ms).
 * Comments label the positional fields of hpa_shard_opts_t.
 */
static hpa_shard_opts_t test_hpa_shard_opts_default = {
	/* slab_max_alloc */
	ALLOC_MAX,
	/* hugification_threshold */
	HUGEPAGE,
	/* dirty_mult */
	FXP_INIT_PERCENT(25),
	/* deferral_allowed */
	false,
	/* hugify_delay_ms */
	10 * 1000,
	/* hugify_sync */
	false,
	/* min_purge_interval_ms */
	5 * 1000,
	/* experimental_max_purge_nhp */
	-1,
	/* peak_demand_window_ms */
	0
};
/*
 * Build a fully initialized hpa_shard_t (embedded in a test_data_t)
 * wired to the given hooks and options.  Aborts via assert on any
 * initialization failure.  Pair with destroy_test_data.
 */
static hpa_shard_t *
create_test_data(const hpa_hooks_t *hooks, hpa_shard_opts_t *opts) {
	bool err;
	base_t *base = base_new(TSDN_NULL, /* ind */ SHARD_IND,
	    &ehooks_default_extent_hooks, /* metadata_use_hooks */ true);
	assert_ptr_not_null(base, "");
	test_data_t *test_data = malloc(sizeof(test_data_t));
	assert_ptr_not_null(test_data, "");
	test_data->base = base;
	err = edata_cache_init(&test_data->shard_edata_cache, base);
	assert_false(err, "");
	err = emap_init(&test_data->emap, test_data->base, /* zeroed */ false);
	assert_false(err, "");
	err = hpa_central_init(&test_data->central, test_data->base, hooks);
	assert_false(err, "");
	err = hpa_shard_init(&test_data->shard, &test_data->central,
	    &test_data->emap, test_data->base, &test_data->shard_edata_cache,
	    SHARD_IND, opts);
	assert_false(err, "");
	/* shard is the first member, so this cast is reversible. */
	return (hpa_shard_t *)test_data;
}
/* Tear down a shard built by create_test_data: delete its base, free it. */
static void
destroy_test_data(hpa_shard_t *shard) {
	test_data_t *td = (test_data_t *)shard;
	base_t *base = td->base;
	base_delete(TSDN_NULL, base);
	free(td);
}
/* Bump-pointer "mapper": hands out fake, monotonically increasing addresses. */
static uintptr_t defer_bump_ptr = HUGEPAGE * 123;
static void *
defer_test_map(size_t size) {
	uintptr_t addr = defer_bump_ptr;
	defer_bump_ptr = addr + size;
	return (void *)addr;
}
/* No-op unmap hook: the bump allocator never really maps anything. */
static void
defer_test_unmap(void *ptr, size_t size) {
	(void)size;
	(void)ptr;
}
/* Counts invocations of the scalar (non-vectorized) purge hook. */
static size_t ndefer_purge_calls = 0;
static void
defer_test_purge(void *ptr, size_t size) {
	(void)size;
	(void)ptr;
	ndefer_purge_calls++;
}
/*
 * Counts vectorized purge invocations.  Returns false; the tests expect
 * no fallback to the scalar purge hook afterwards, so false presumably
 * signals success -- NOTE(review): confirm the hook's return convention.
 */
static size_t ndefer_vec_purge_calls = 0;
static bool
defer_vectorized_purge(void *vec, size_t vlen, size_t nbytes) {
	(void)vec;
	/* vlen was not cast to void before: unused-parameter warning. */
	(void)vlen;
	(void)nbytes;
	++ndefer_vec_purge_calls;
	return false;
}
/* Counting stub for the hugify hook; never actually hugifies. */
static size_t ndefer_hugify_calls = 0;
static bool
defer_test_hugify(void *ptr, size_t size, bool sync) {
	/* All parameters intentionally ignored (were uncast: warnings). */
	(void)ptr;
	(void)size;
	(void)sync;
	++ndefer_hugify_calls;
	return false;
}
/* Counting stub for the dehugify hook. */
static size_t ndefer_dehugify_calls = 0;
static void
defer_test_dehugify(void *ptr, size_t size) {
	/* Parameters intentionally ignored (were uncast: warnings). */
	(void)ptr;
	(void)size;
	++ndefer_dehugify_calls;
}
/* Fake clock: tests set defer_curtime directly; the hook just reports it. */
static nstime_t defer_curtime;
static void
defer_test_curtime(nstime_t *r_time, bool first_reading) {
	/* first_reading was not cast to void: unused-parameter warning. */
	(void)first_reading;
	*r_time = defer_curtime;
}
/* Milliseconds elapsed on the fake clock since past_time. */
static uint64_t
defer_test_ms_since(nstime_t *past_time) {
	uint64_t elapsed_ns = nstime_ns(&defer_curtime) - nstime_ns(past_time);
	return elapsed_ns / (1000 * 1000);
}
TEST_BEGIN(test_vectorized_purge) {
	test_skip_if(!hpa_supported() ||
	    opt_process_madvise_max_batch == 0 || HUGEPAGE_PAGES <= 4);
	/*
	 * The companion .sh sets MALLOC_CONF="process_madvise_max_batch:64";
	 * the expectations below depend on that exact value.
	 */
	assert(opt_process_madvise_max_batch == 64);
	hpa_hooks_t hooks;
	hooks.map = &defer_test_map;
	hooks.unmap = &defer_test_unmap;
	hooks.purge = &defer_test_purge;
	hooks.hugify = &defer_test_hugify;
	hooks.dehugify = &defer_test_dehugify;
	hooks.curtime = &defer_test_curtime;
	hooks.ms_since = &defer_test_ms_since;
	hooks.vectorized_purge = &defer_vectorized_purge;
	hpa_shard_opts_t opts = test_hpa_shard_opts_default;
	opts.deferral_allowed = true;
	/* Purge on every deferred-work pass, no interval gating. */
	opts.min_purge_interval_ms = 0;
	ndefer_vec_purge_calls = 0;
	ndefer_purge_calls = 0;
	hpa_shard_t *shard = create_test_data(&hooks, &opts);
	bool deferred_work_generated = false;
	nstime_init(&defer_curtime, 0);
	tsdn_t *tsdn = tsd_tsdn(tsd_fetch());
	/* Fill 8 huge pages with PAGE-sized allocations. */
	enum {NALLOCS = 8 * HUGEPAGE_PAGES};
	edata_t *edatas[NALLOCS];
	for (int i = 0; i < NALLOCS; i++) {
		edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false,
		    false, false, &deferred_work_generated);
		expect_ptr_not_null(edatas[i], "Unexpected null edata");
	}
	/* Deallocate almost 3 hugepages out of 8, and to force batching
	 * leave the 2nd and 4th PAGE in the first 3 hugepages.
	 */
	for (int i = 0; i < 3 * (int)HUGEPAGE_PAGES; i++) {
		int j = i % HUGEPAGE_PAGES;
		if (j != 1 && j != 3) {
			pai_dalloc(tsdn, &shard->pai, edatas[i],
			    &deferred_work_generated);
		}
	}
	hpa_shard_do_deferred_work(tsdn, shard);
	/*
	 * We purge from 2 huge pages, each one 3 dirty contiguous segments.
	 * For opt_process_madvise_max_batch = 64, that is all just one call.
	 */
	expect_zu_eq(1, ndefer_vec_purge_calls, "Expect single purge");
	ndefer_vec_purge_calls = 0;
	destroy_test_data(shard);
}
TEST_END
int
main(void) {
	/* Entry point: run test_vectorized_purge via the test harness. */
	return test_no_reentrancy(
	    test_vectorized_purge);
}

View file

@ -0,0 +1,3 @@
#!/bin/sh
# Runner config for hpa_vectorized_madvise_large_batch: use a 64-entry
# process_madvise batch so multiple hugepages' ranges fit in one syscall.
# The test asserts opt_process_madvise_max_batch == 64 to match this.
export MALLOC_CONF="process_madvise_max_batch:64"