diff --git a/Makefile.in b/Makefile.in index ee3399ec..ac8c51ff 100644 --- a/Makefile.in +++ b/Makefile.in @@ -232,6 +232,7 @@ TESTS_UNIT := \ $(srcroot)test/unit/hook.c \ $(srcroot)test/unit/hpa.c \ $(srcroot)test/unit/hpa_vectorized_madvise.c \ + $(srcroot)test/unit/hpa_vectorized_madvise_large_batch.c \ $(srcroot)test/unit/hpa_background_thread.c \ $(srcroot)test/unit/hpdata.c \ $(srcroot)test/unit/huge.c \ diff --git a/include/jemalloc/internal/hpa_utils.h b/include/jemalloc/internal/hpa_utils.h index 035d3b21..283510b9 100644 --- a/include/jemalloc/internal/hpa_utils.h +++ b/include/jemalloc/internal/hpa_utils.h @@ -79,4 +79,38 @@ hpa_range_accum_finish(hpa_range_accum_t *ra, hpa_shard_t *shard) { } } +/* + * For purging more than one page we use batch of these items + */ +typedef struct { + hpdata_purge_state_t state; + hpdata_t *hp; + bool dehugify; +} hpa_purge_item_t; + +typedef struct hpa_purge_batch_s hpa_purge_batch_t; +struct hpa_purge_batch_s { + hpa_purge_item_t *items; + size_t items_capacity; + /* Number of huge pages to purge in current batch */ + size_t item_cnt; + /* Number of ranges to purge in current batch */ + size_t nranges; + /* Total number of dirty pages in current batch*/ + size_t ndirty_in_batch; + + /* Max number of huge pages to purge */ + size_t max_hp; + /* + * Once we are above this watermark we should not add more pages + * to the same batch. This is because while we want to minimize + * number of madvise calls we also do not want to be preventing + * allocations from too many huge pages (which we have to do + * while they are being purged) + */ + size_t range_watermark; + + size_t npurged_hp_total; +}; + #endif /* JEMALLOC_INTERNAL_HPA_UTILS_H */ diff --git a/src/extent.c b/src/extent.c index 86b30f82..3425e1ce 100644 --- a/src/extent.c +++ b/src/extent.c @@ -12,6 +12,7 @@ /* Data. */ size_t opt_lg_extent_max_active_fit = LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT; +/* This option is intended for kernel tuning, not app tuning. 
*/ size_t opt_process_madvise_max_batch = #ifdef JEMALLOC_HAVE_PROCESS_MADVISE PROCESS_MADVISE_MAX_BATCH_DEFAULT; diff --git a/src/hpa.c b/src/hpa.c index afcfbe7f..50614e42 100644 --- a/src/hpa.c +++ b/src/hpa.c @@ -423,6 +423,31 @@ hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) { return to_hugify != NULL || hpa_should_purge(tsdn, shard); } +/* + * This is used for jemalloc internal tuning and may change in the + * future based on production traffic. + * + * This value protects two things: + * 1. Stack size + * 2. Number of huge pages that are being purged in a batch as + * we do not allow allocations while making *madvise + * syscall. + */ +#define HPA_PURGE_BATCH_MAX_DEFAULT 16 + +#ifndef JEMALLOC_JET +#define HPA_PURGE_BATCH_MAX HPA_PURGE_BATCH_MAX_DEFAULT +#else +size_t hpa_purge_max_batch_size_for_test = HPA_PURGE_BATCH_MAX_DEFAULT; +size_t +hpa_purge_max_batch_size_for_test_set(size_t new_size) { + size_t old_size = hpa_purge_max_batch_size_for_test; + hpa_purge_max_batch_size_for_test = new_size; + return old_size; +} +#define HPA_PURGE_BATCH_MAX hpa_purge_max_batch_size_for_test +#endif + static inline size_t hpa_process_madvise_max_iovec_len(void) { assert(opt_process_madvise_max_batch <= @@ -431,14 +456,48 @@ hpa_process_madvise_max_iovec_len(void) { HPA_MIN_VAR_VEC_SIZE : opt_process_madvise_max_batch; } -/* Returns whether or not we purged anything. 
*/ -static bool -hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) { - malloc_mutex_assert_owner(tsdn, &shard->mtx); +static inline void +hpa_purge_actual_unlocked(hpa_shard_t *shard, hpa_purge_item_t *batch, + size_t batch_sz) { + assert(batch_sz > 0); - hpdata_t *to_purge = psset_pick_purge(&shard->psset); + size_t len = hpa_process_madvise_max_iovec_len(); + VARIABLE_ARRAY(hpa_io_vector_t, vec, len); + + hpa_range_accum_t accum; + hpa_range_accum_init(&accum, vec, len); + + for (size_t i = 0; i < batch_sz; ++i) { + hpdata_t *to_purge = batch[i].hp; + + /* Actually do the purging, now that the lock is dropped. */ + if (batch[i].dehugify) { + shard->central->hooks.dehugify(hpdata_addr_get(to_purge), + HUGEPAGE); + } + void *purge_addr; + size_t purge_size; + size_t total_purged_on_one_hp = 0; + while (hpdata_purge_next( + to_purge, &batch[i].state, &purge_addr, &purge_size)) { + total_purged_on_one_hp += purge_size; + assert(total_purged_on_one_hp <= HUGEPAGE); + hpa_range_accum_add(&accum, purge_addr, purge_size, shard); + } + } + hpa_range_accum_finish(&accum, shard); +} + +/* Prepare purge of one huge page. Return the number of dirty regular pages on it. + * Return 0 if no purgeable huge page is found. + * + * If there was a page to purge, its purge state is initialized. + */ +static inline size_t +hpa_purge_start_hp(hpa_purge_batch_t *b, psset_t *psset) { + hpdata_t *to_purge = psset_pick_purge(psset); if (to_purge == NULL) { - return false; + return 0; } assert(hpdata_purge_allowed_get(to_purge)); assert(!hpdata_changing_state_get(to_purge)); @@ -448,7 +507,7 @@ hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) { * we're purging it (allocations and deallocations are * OK). 
*/ - psset_update_begin(&shard->psset, to_purge); + psset_update_begin(psset, to_purge); assert(hpdata_alloc_allowed_get(to_purge)); hpdata_mid_purge_set(to_purge, true); hpdata_purge_allowed_set(to_purge, false); @@ -461,70 +520,115 @@ hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) { * (clearing out user data). */ hpdata_alloc_allowed_set(to_purge, false); - psset_update_end(&shard->psset, to_purge); + psset_update_end(psset, to_purge); + assert(b->item_cnt < b->items_capacity); + hpa_purge_item_t *hp_item = &b->items[b->item_cnt]; + b->item_cnt++; + hp_item->hp = to_purge; /* Gather all the metadata we'll need during the purge. */ - bool dehugify = hpdata_huge_get(to_purge); + hp_item->dehugify = hpdata_huge_get(hp_item->hp); size_t nranges; - hpdata_purge_state_t purge_state; - size_t num_to_purge = hpdata_purge_begin(to_purge, &purge_state, &nranges); - (void) nranges; /*not used yet */ + size_t ndirty = + hpdata_purge_begin(hp_item->hp, &hp_item->state, &nranges); + /* We picked hp to purge, so it should have some dirty ranges */ + assert(ndirty > 0 && nranges >0); + b->ndirty_in_batch += ndirty; + b->nranges += nranges; + return ndirty; +} - shard->npending_purge += num_to_purge; - - malloc_mutex_unlock(tsdn, &shard->mtx); - - /* Actually do the purging, now that the lock is dropped. 
*/ - if (dehugify) { - shard->central->hooks.dehugify(hpdata_addr_get(to_purge), - HUGEPAGE); - } - size_t total_purged = 0; - uint64_t purges_this_pass = 0; - - size_t len = hpa_process_madvise_max_iovec_len(); - VARIABLE_ARRAY(hpa_io_vector_t, vec, len); - - hpa_range_accum_t accum; - hpa_range_accum_init(&accum, vec, len); - - void *purge_addr; - size_t purge_size; - while (hpdata_purge_next(to_purge, &purge_state, &purge_addr, - &purge_size)) { - total_purged += purge_size; - assert(total_purged <= HUGEPAGE); - hpa_range_accum_add(&accum, purge_addr, purge_size, shard); - purges_this_pass++; - } - /* If batch was not full, finish */ - hpa_range_accum_finish(&accum, shard); - - malloc_mutex_lock(tsdn, &shard->mtx); - /* The shard updates */ - shard->npending_purge -= num_to_purge; - shard->stats.npurge_passes++; - shard->stats.npurges += purges_this_pass; - shard->central->hooks.curtime(&shard->last_purge, - /* first_reading */ false); - if (dehugify) { +/* Finish purge of one huge page. */ +static inline void +hpa_purge_finish_hp(tsdn_t *tsdn, hpa_shard_t *shard, + hpa_purge_item_t *hp_item) { + if (hp_item->dehugify) { shard->stats.ndehugifies++; } - /* The hpdata updates. 
*/ - psset_update_begin(&shard->psset, to_purge); - if (dehugify) { - hpdata_dehugify(to_purge); + psset_update_begin(&shard->psset, hp_item->hp); + if (hp_item->dehugify) { + hpdata_dehugify(hp_item->hp); } - hpdata_purge_end(to_purge, &purge_state); - hpdata_mid_purge_set(to_purge, false); + hpdata_purge_end(hp_item->hp, &hp_item->state); + hpdata_mid_purge_set(hp_item->hp, false); - hpdata_alloc_allowed_set(to_purge, true); - hpa_update_purge_hugify_eligibility(tsdn, shard, to_purge); + hpdata_alloc_allowed_set(hp_item->hp, true); + hpa_update_purge_hugify_eligibility(tsdn, shard, hp_item->hp); - psset_update_end(&shard->psset, to_purge); + psset_update_end(&shard->psset, hp_item->hp); +} - return true; +static inline bool +hpa_batch_full(hpa_purge_batch_t *b) { + /* It's okay for ranges to go above */ + return b->npurged_hp_total == b->max_hp || + b->item_cnt == b->items_capacity || + b->nranges >= b->range_watermark; +} + +static inline void +hpa_batch_pass_start(hpa_purge_batch_t *b) { + b->item_cnt = 0; + b->nranges = 0; + b->ndirty_in_batch = 0; +} + +static inline bool +hpa_batch_empty(hpa_purge_batch_t *b) { + return b->item_cnt == 0; +} + +/* Returns number of huge pages purged. 
Caller must hold shard->mtx; it is dropped only around the unlocked purge call and is held again on return. */ +static inline size_t +hpa_purge(tsdn_t *tsdn, hpa_shard_t *shard, size_t max_hp) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + assert(max_hp > 0); + + assert(HPA_PURGE_BATCH_MAX > 0); + assert(HPA_PURGE_BATCH_MAX < + (VARIABLE_ARRAY_SIZE_MAX / sizeof(hpa_purge_item_t))); + VARIABLE_ARRAY(hpa_purge_item_t, items, HPA_PURGE_BATCH_MAX); + hpa_purge_batch_t batch = { + .max_hp = max_hp, + .npurged_hp_total = 0, + .items = &items[0], + .items_capacity = HPA_PURGE_BATCH_MAX, + .range_watermark = hpa_process_madvise_max_iovec_len(), + }; + assert(batch.range_watermark > 0); + + while (1) { + hpa_batch_pass_start(&batch); + assert(hpa_batch_empty(&batch)); + while(!hpa_batch_full(&batch) && hpa_should_purge(tsdn, shard)) { + size_t ndirty = hpa_purge_start_hp(&batch, &shard->psset); + if (ndirty == 0) { + break; + } + shard->npending_purge += ndirty; + batch.npurged_hp_total++; + } + + if (hpa_batch_empty(&batch)) { + break; + } + malloc_mutex_unlock(tsdn, &shard->mtx); + hpa_purge_actual_unlocked(shard, batch.items, batch.item_cnt); + malloc_mutex_lock(tsdn, &shard->mtx); + + /* The shard updates */ + shard->npending_purge -= batch.ndirty_in_batch; + shard->stats.npurges += batch.ndirty_in_batch; + shard->central->hooks.curtime(&shard->last_purge, + /* first_reading */ false); + for (size_t i=0; i<batch.item_cnt; ++i) { + hpa_purge_finish_hp(tsdn, shard, &batch.items[i]); + } + } + malloc_mutex_assert_owner(tsdn, &shard->mtx); + shard->stats.npurge_passes++; + return batch.npurged_hp_total; } /* Returns whether or not we hugified anything. */ @@ -654,19 +758,9 @@ hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard, max_purges = max_purge_nhp; } - while (hpa_should_purge(tsdn, shard) && nops < max_purges) { - if (!hpa_try_purge(tsdn, shard)) { - /* - * It is fine if we couldn't purge as sometimes - * we try to purge just to unblock - * hugification, but there is maybe no dirty - * pages at all at the moment. 
- */ - break; - } - malloc_mutex_assert_owner(tsdn, &shard->mtx); - nops++; - } + malloc_mutex_assert_owner(tsdn, &shard->mtx); + nops += hpa_purge(tsdn, shard, max_purges); + malloc_mutex_assert_owner(tsdn, &shard->mtx); } /* diff --git a/test/unit/hpa_vectorized_madvise.c b/test/unit/hpa_vectorized_madvise.c index 130dc699..ae25fdde 100644 --- a/test/unit/hpa_vectorized_madvise.c +++ b/test/unit/hpa_vectorized_madvise.c @@ -237,15 +237,86 @@ TEST_BEGIN(test_more_regions_purged_from_one_page) { expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early"); expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early"); - /* We purge from 2 huge pages, each one 3 segments. That's 6 non - * vectorized calls, or 2 <= vc <=6 vectorized calls - * (depending on batch size). + /* We purge from 2 huge pages, each one 3 dirty contiguous segments. + * For opt_process_madvise_max_batch = 2, that is + * 2 calls for first page, and 2 calls for second as we don't + * want to hold the lock on the second page while vectorized batch + * of size 2 is already filled with the first one. 
*/ - size_t nexpected = 2 * (1 + (3 - 1) / opt_process_madvise_max_batch); + expect_zu_eq(4, ndefer_vec_purge_calls, "Expect purge"); + expect_zu_eq(0, ndefer_purge_calls, "Expect no non-vec purge"); + ndefer_vec_purge_calls = 0; + + destroy_test_data(shard); +} +TEST_END + +size_t +hpa_purge_max_batch_size_for_test_set(size_t new_size); +TEST_BEGIN(test_more_pages_than_batch_page_size) { + test_skip_if(!hpa_supported() || + (opt_process_madvise_max_batch == 0) || + HUGEPAGE_PAGES <= 4); + + size_t old_page_batch = hpa_purge_max_batch_size_for_test_set(1); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + hooks.vectorized_purge = &defer_vectorized_purge; + + hpa_shard_opts_t opts = test_hpa_shard_opts_default; + opts.deferral_allowed = true; + opts.min_purge_interval_ms = 0; + ndefer_vec_purge_calls = 0; + ndefer_purge_calls = 0; + + hpa_shard_t *shard = create_test_data(&hooks, &opts); + + bool deferred_work_generated = false; + + nstime_init(&defer_curtime, 0); + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + + enum {NALLOCS = 8 * HUGEPAGE_PAGES}; + edata_t *edatas[NALLOCS]; + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + for (int i = 0; i < 3 * (int)HUGEPAGE_PAGES; i++) { + pai_dalloc(tsdn, &shard->pai, edatas[i], + &deferred_work_generated); + } + + hpa_shard_do_deferred_work(tsdn, shard); + + /* + * Strict minimum purge interval is not set, we should purge as long as + * we have dirty pages. + */ + expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early"); + expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early"); + + /* We have page batch size = 1. 
+ * we have 5 * HP active pages, 3 * HP dirty pages + * To achieve the balance of 25% max dirty we need to + * purge 2 pages. Since batch is 1 that must be 2 calls + * no matter what opt_process_madvise_max_batch is + */ + size_t nexpected = 2; expect_zu_eq(nexpected, ndefer_vec_purge_calls, "Expect purge"); expect_zu_eq(0, ndefer_purge_calls, "Expect no non-vec purge"); ndefer_vec_purge_calls = 0; + hpa_purge_max_batch_size_for_test_set(old_page_batch); + destroy_test_data(shard); } TEST_END @@ -254,5 +325,6 @@ int main(void) { return test_no_reentrancy( test_vectorized_failure_fallback, - test_more_regions_purged_from_one_page); + test_more_regions_purged_from_one_page, + test_more_pages_than_batch_page_size); } diff --git a/test/unit/hpa_vectorized_madvise_large_batch.c b/test/unit/hpa_vectorized_madvise_large_batch.c new file mode 100644 index 00000000..99ce15f4 --- /dev/null +++ b/test/unit/hpa_vectorized_madvise_large_batch.c @@ -0,0 +1,199 @@ +#include "test/jemalloc_test.h" + +#include "jemalloc/internal/hpa.h" +#include "jemalloc/internal/nstime.h" + +#define SHARD_IND 111 + +#define ALLOC_MAX (HUGEPAGE) + +typedef struct test_data_s test_data_t; +struct test_data_s { + /* + * Must be the first member -- we convert back and forth between the + * test_data_t and the hpa_shard_t; + */ + hpa_shard_t shard; + hpa_central_t central; + base_t *base; + edata_cache_t shard_edata_cache; + + emap_t emap; +}; + +static hpa_shard_opts_t test_hpa_shard_opts_default = { + /* slab_max_alloc */ + ALLOC_MAX, + /* hugification_threshold */ + HUGEPAGE, + /* dirty_mult */ + FXP_INIT_PERCENT(25), + /* deferral_allowed */ + false, + /* hugify_delay_ms */ + 10 * 1000, + /* hugify_sync */ + false, + /* min_purge_interval_ms */ + 5 * 1000, + /* experimental_max_purge_nhp */ + -1, + /* peak_demand_window_ms */ + 0 +}; + +static hpa_shard_t * +create_test_data(const hpa_hooks_t *hooks, hpa_shard_opts_t *opts) { + bool err; + base_t *base = base_new(TSDN_NULL, /* ind */ SHARD_IND, + 
&ehooks_default_extent_hooks, /* metadata_use_hooks */ true); + assert_ptr_not_null(base, ""); + + test_data_t *test_data = malloc(sizeof(test_data_t)); + assert_ptr_not_null(test_data, ""); + + test_data->base = base; + + err = edata_cache_init(&test_data->shard_edata_cache, base); + assert_false(err, ""); + + err = emap_init(&test_data->emap, test_data->base, /* zeroed */ false); + assert_false(err, ""); + + err = hpa_central_init(&test_data->central, test_data->base, hooks); + assert_false(err, ""); + + err = hpa_shard_init(&test_data->shard, &test_data->central, + &test_data->emap, test_data->base, &test_data->shard_edata_cache, + SHARD_IND, opts); + assert_false(err, ""); + + return (hpa_shard_t *)test_data; +} + +static void +destroy_test_data(hpa_shard_t *shard) { + test_data_t *test_data = (test_data_t *)shard; + base_delete(TSDN_NULL, test_data->base); + free(test_data); +} + +static uintptr_t defer_bump_ptr = HUGEPAGE * 123; +static void * +defer_test_map(size_t size) { + void *result = (void *)defer_bump_ptr; + defer_bump_ptr += size; + return result; +} + +static void +defer_test_unmap(void *ptr, size_t size) { + (void)ptr; + (void)size; +} + +static size_t ndefer_purge_calls = 0; +static void +defer_test_purge(void *ptr, size_t size) { + (void)ptr; + (void)size; + ++ndefer_purge_calls; +} + +static size_t ndefer_vec_purge_calls = 0; +static bool +defer_vectorized_purge(void *vec, size_t vlen, size_t nbytes) { + (void)vec; + (void)nbytes; + ++ndefer_vec_purge_calls; + return false; +} + +static size_t ndefer_hugify_calls = 0; +static bool +defer_test_hugify(void *ptr, size_t size, bool sync) { + ++ndefer_hugify_calls; + return false; +} + +static size_t ndefer_dehugify_calls = 0; +static void +defer_test_dehugify(void *ptr, size_t size) { + ++ndefer_dehugify_calls; +} + +static nstime_t defer_curtime; +static void +defer_test_curtime(nstime_t *r_time, bool first_reading) { + *r_time = defer_curtime; +} + +static uint64_t +defer_test_ms_since(nstime_t 
*past_time) { + return (nstime_ns(&defer_curtime) - nstime_ns(past_time)) / 1000 / 1000; +} + +TEST_BEGIN(test_vectorized_purge) { + test_skip_if(!hpa_supported() || + opt_process_madvise_max_batch == 0 || HUGEPAGE_PAGES <= 4); + assert(opt_process_madvise_max_batch == 64); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + hooks.vectorized_purge = &defer_vectorized_purge; + + hpa_shard_opts_t opts = test_hpa_shard_opts_default; + opts.deferral_allowed = true; + opts.min_purge_interval_ms = 0; + ndefer_vec_purge_calls = 0; + ndefer_purge_calls = 0; + + hpa_shard_t *shard = create_test_data(&hooks, &opts); + + bool deferred_work_generated = false; + + nstime_init(&defer_curtime, 0); + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + + enum {NALLOCS = 8 * HUGEPAGE_PAGES}; + edata_t *edatas[NALLOCS]; + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + /* Deallocate almost 3 hugepages out of 8, and to force batching + * leave the 2nd and 4th PAGE in the first 3 hugepages. + */ + for (int i = 0; i < 3 * (int)HUGEPAGE_PAGES; i++) { + int j = i % HUGEPAGE_PAGES; + if (j != 1 && j != 3) { + pai_dalloc(tsdn, &shard->pai, edatas[i], + &deferred_work_generated); + } + } + + hpa_shard_do_deferred_work(tsdn, shard); + + /* + * We purge from 2 huge pages, each one 3 dirty contiguous segments. 
+ * For opt_process_madvise_max_batch = 64, that is all just one call + */ + expect_zu_eq(1, ndefer_vec_purge_calls, "Expect single purge"); + ndefer_vec_purge_calls = 0; + + destroy_test_data(shard); +} +TEST_END + +int +main(void) { + return test_no_reentrancy( + test_vectorized_purge); +} diff --git a/test/unit/hpa_vectorized_madvise_large_batch.sh b/test/unit/hpa_vectorized_madvise_large_batch.sh new file mode 100644 index 00000000..f996047f --- /dev/null +++ b/test/unit/hpa_vectorized_madvise_large_batch.sh @@ -0,0 +1,3 @@ +#!/bin/sh + +export MALLOC_CONF="process_madvise_max_batch:64"