diff --git a/include/jemalloc/internal/hpa.h b/include/jemalloc/internal/hpa.h index 7a6ba0b9..131bbb90 100644 --- a/include/jemalloc/internal/hpa.h +++ b/include/jemalloc/internal/hpa.h @@ -147,6 +147,15 @@ struct hpa_shard_s { * Last time we performed purge on this shard. */ nstime_t last_purge; + + /* + * Last time when we attempted work (purging or hugifying). If deferral + * of the work is allowed (we have background thread), this is the time + * when background thread checked if purging or hugifying needs to be + * done. If deferral is not allowed, this is the time of (hpa_alloc or + * hpa_dalloc) activity in the shard. + */ + nstime_t last_time_work_attempted; }; bool hpa_hugepage_size_exceeds_limit(void); diff --git a/include/jemalloc/internal/hpa_opts.h b/include/jemalloc/internal/hpa_opts.h index 9e7f76ac..6747c2db 100644 --- a/include/jemalloc/internal/hpa_opts.h +++ b/include/jemalloc/internal/hpa_opts.h @@ -7,8 +7,60 @@ /* * This file is morally part of hpa.h, but is split out for header-ordering * reasons. + * + * All of these hpa_shard_opts below are experimental. We are exploring more + * efficient packing, hugifying, and purging approaches to make efficient + * trade-offs between CPU, memory, latency, and usability. This means all of + * them are at the risk of being deprecated and corresponding configurations + * should be updated once the final version settles. */ +/* + * This enum controls how jemalloc hugifies/dehugifies pages. Each style may be + * more suitable depending on deployment environments. + * + * hpa_hugify_style_none + * Using this means that jemalloc will not be hugifying or dehugifying pages, + * but will let the kernel make those decisions. This style only makes sense + * when deploying on systems where THP are enabled in 'always' mode. 
With this + * style, you most likely want to have no purging at all (dirty_mult=-1) or + * purge_threshold=HUGEPAGE bytes (2097152 for 2MiB page), although other + * thresholds may work well depending on kernel settings of your deployment + * targets. + * + * hpa_hugify_style_eager + * This style results in jemalloc giving hugepage advice, if needed, to + * anonymous memory immediately after it is mapped, so huge pages can be backing + * that memory at page-fault time. This is usually more efficient than doing + * it later, and it allows us to benefit from the hugepages from the start. + * Same options for purging as for the style 'none' are good starting choices: + * no purging, or purge_threshold=HUGEPAGE, some min_purge_delay_ms that allows + * for page not to be purged quickly, etc. This is a good choice if you can + * afford extra memory and your application gets performance increase from + * transparent hugepages. + * + * hpa_hugify_style_lazy + * This style is suitable when you purge more aggressively (you sacrifice CPU + * performance for less memory). When this style is chosen, jemalloc will + * hugify once hugification_threshold is reached, and dehugify before purging. + * If the kernel is configured to use direct compaction you may experience some + * allocation latency when using this style. The best is to measure what works + * better for your application needs, and in the target deployment environment. + * This is a good choice for apps that cannot afford a lot of memory regression, + * but would still like to benefit from backing certain memory regions with + * hugepages. 
+ */ +enum hpa_hugify_style_e { + hpa_hugify_style_auto = 0, + hpa_hugify_style_none = 1, + hpa_hugify_style_eager = 2, + hpa_hugify_style_lazy = 3, + hpa_hugify_style_limit = hpa_hugify_style_lazy + 1 +}; +typedef enum hpa_hugify_style_e hpa_hugify_style_t; + +extern const char *const hpa_hugify_style_names[]; + typedef struct hpa_shard_opts_s hpa_shard_opts_t; struct hpa_shard_opts_s { /* @@ -46,7 +98,8 @@ struct hpa_shard_opts_s { uint64_t hugify_delay_ms; /* - * Hugify pages synchronously. + * Hugify pages synchronously (hugify will happen even if hugify_style + * is not hpa_hugify_style_lazy). */ bool hugify_sync; @@ -59,6 +112,46 @@ struct hpa_shard_opts_s { * Maximum number of hugepages to purge on each purging attempt. */ ssize_t experimental_max_purge_nhp; + + /* + * Minimum number of inactive bytes needed for a non-empty page to be + * considered purgable. + * + * When the number of touched inactive bytes on non-empty hugepage is + * >= purge_threshold, the page is purgable. Empty pages are always + * purgable. Setting this to HUGEPAGE bytes would only purge empty + * pages if using hugify_style_eager and the purges would be exactly + * HUGEPAGE bytes. Depending on your kernel settings, this may result + * in better performance. + * + * Please note, when threshold is reached, we will purge all the dirty + * bytes, and not just up to the threshold. If this is PAGE bytes, then + * all the pages that have any dirty bytes are purgable. We treat + * purgability constraint for purge_threshold as stronger than + * dirty_mult, IOW, if no page meets purge_threshold, we will not purge + * even if we are above dirty_mult. + */ + size_t purge_threshold; + + /* + * Minimum number of ms that needs to elapse between HP page becoming + * eligible for purging and actually getting purged. + * + * Setting this to a larger number would give better chance of reusing + * that memory. 
Setting it to 0 means that page is eligible for purging + * as soon as it meets the purge_threshold. The clock resets when + * purgability of the page changes (page goes from being non-purgable to + * purgable). When using eager style you probably want to allow for + * some delay, to avoid purging the page too quickly and give it time to + * be used. + */ + uint64_t min_purge_delay_ms; + + /* + * Style of hugification/dehugification (see comment at + * hpa_hugify_style_t for options). + */ + hpa_hugify_style_t hugify_style; }; /* clang-format off */ @@ -84,7 +177,13 @@ struct hpa_shard_opts_s { /* min_purge_interval_ms */ \ 5 * 1000, \ /* experimental_max_purge_nhp */ \ - -1 \ + -1, \ + /* size_t purge_threshold */ \ + PAGE, \ + /* min_purge_delay_ms */ \ + 0, \ + /* hugify_style */ \ + hpa_hugify_style_lazy \ } /* clang-format on */ diff --git a/include/jemalloc/internal/hpdata.h b/include/jemalloc/internal/hpdata.h index 75550f9b..eb83c900 100644 --- a/include/jemalloc/internal/hpdata.h +++ b/include/jemalloc/internal/hpdata.h @@ -124,6 +124,12 @@ struct hpdata_s { /* The touched pages (using the same definition as above). 
*/ fb_group_t touched_pages[FB_NGROUPS(HUGEPAGE_PAGES)]; + + /* Time when this extent (hpdata) becomes eligible for purging */ + nstime_t h_time_purge_allowed; + + /* True if the extent was huge and empty last time when it was purged */ + bool h_purged_when_empty_and_huge; }; TYPED_LIST(hpdata_empty_list, hpdata_t, ql_link_empty) @@ -284,17 +290,17 @@ hpdata_longest_free_range_set(hpdata_t *hpdata, size_t longest_free_range) { } static inline size_t -hpdata_nactive_get(hpdata_t *hpdata) { +hpdata_nactive_get(const hpdata_t *hpdata) { return hpdata->h_nactive; } static inline size_t -hpdata_ntouched_get(hpdata_t *hpdata) { +hpdata_ntouched_get(const hpdata_t *hpdata) { return hpdata->h_ntouched; } static inline size_t -hpdata_ndirty_get(hpdata_t *hpdata) { +hpdata_ndirty_get(const hpdata_t *hpdata) { return hpdata->h_ntouched - hpdata->h_nactive; } @@ -303,6 +309,26 @@ hpdata_nretained_get(hpdata_t *hpdata) { return HUGEPAGE_PAGES - hpdata->h_ntouched; } +static inline void +hpdata_time_purge_allowed_set(hpdata_t *hpdata, const nstime_t *v) { + nstime_copy(&hpdata->h_time_purge_allowed, v); +} + +static inline const nstime_t * +hpdata_time_purge_allowed_get(const hpdata_t *hpdata) { + return &hpdata->h_time_purge_allowed; +} + +static inline bool +hpdata_purged_when_empty_and_huge_get(const hpdata_t *hpdata) { + return hpdata->h_purged_when_empty_and_huge; +} + +static inline void +hpdata_purged_when_empty_and_huge_set(hpdata_t *hpdata, bool v) { + hpdata->h_purged_when_empty_and_huge = v; +} + static inline void hpdata_assert_empty(hpdata_t *hpdata) { assert(fb_empty(hpdata->active_pages, HUGEPAGE_PAGES)); @@ -360,7 +386,7 @@ hpdata_full(const hpdata_t *hpdata) { return hpdata->h_nactive == HUGEPAGE_PAGES; } -void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age); +void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age, bool is_huge); /* * Given an hpdata which can serve an allocation request, pick and reserve an diff --git 
a/include/jemalloc/internal/nstime.h b/include/jemalloc/internal/nstime.h index a10b2de1..0848b9d0 100644 --- a/include/jemalloc/internal/nstime.h +++ b/include/jemalloc/internal/nstime.h @@ -40,6 +40,8 @@ void nstime_isubtract(nstime_t *time, uint64_t subtrahend); void nstime_imultiply(nstime_t *time, uint64_t multiplier); void nstime_idivide(nstime_t *time, uint64_t divisor); uint64_t nstime_divide(const nstime_t *time, const nstime_t *divisor); +uint64_t nstime_ns_between(const nstime_t *earlier, const nstime_t *later); +uint64_t nstime_ms_between(const nstime_t *earlier, const nstime_t *later); uint64_t nstime_ns_since(const nstime_t *past); uint64_t nstime_ms_since(const nstime_t *past); @@ -67,7 +69,7 @@ nstime_init_zero(nstime_t *time) { } JEMALLOC_ALWAYS_INLINE bool -nstime_equals_zero(nstime_t *time) { +nstime_equals_zero(const nstime_t *time) { int diff = nstime_compare(time, &nstime_zero); assert(diff >= 0); return diff == 0; diff --git a/include/jemalloc/internal/psset.h b/include/jemalloc/internal/psset.h index 3fdecaed..f096e414 100644 --- a/include/jemalloc/internal/psset.h +++ b/include/jemalloc/internal/psset.h @@ -121,8 +121,12 @@ void psset_update_end(psset_t *psset, hpdata_t *ps); /* Analogous to the eset_fit; pick a hpdata to serve the request. */ hpdata_t *psset_pick_alloc(psset_t *psset, size_t size); -/* Pick one to purge. */ -hpdata_t *psset_pick_purge(psset_t *psset); +/* + * Pick one to purge that is purgable before given time (inclusive). If now + * is NULL then time is not considered. + */ +hpdata_t *psset_pick_purge(psset_t *psset, const nstime_t *now); + /* Pick one to hugify. 
*/ hpdata_t *psset_pick_hugify(psset_t *psset); diff --git a/src/ctl.c b/src/ctl.c index a4c60ce0..85583bec 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -106,6 +106,9 @@ CTL_PROTO(opt_hpa_hugify_delay_ms) CTL_PROTO(opt_hpa_hugify_sync) CTL_PROTO(opt_hpa_min_purge_interval_ms) CTL_PROTO(opt_experimental_hpa_max_purge_nhp) +CTL_PROTO(opt_hpa_purge_threshold) +CTL_PROTO(opt_hpa_min_purge_delay_ms) +CTL_PROTO(opt_hpa_hugify_style) CTL_PROTO(opt_hpa_dirty_mult) CTL_PROTO(opt_hpa_sec_nshards) CTL_PROTO(opt_hpa_sec_max_alloc) @@ -469,6 +472,9 @@ static const ctl_named_node_t opt_node[] = {{NAME("abort"), CTL(opt_abort)}, {NAME("hpa_min_purge_interval_ms"), CTL(opt_hpa_min_purge_interval_ms)}, {NAME("experimental_hpa_max_purge_nhp"), CTL(opt_experimental_hpa_max_purge_nhp)}, + {NAME("hpa_purge_threshold"), CTL(opt_hpa_purge_threshold)}, + {NAME("hpa_min_purge_delay_ms"), CTL(opt_hpa_min_purge_delay_ms)}, + {NAME("hpa_hugify_style"), CTL(opt_hpa_hugify_style)}, {NAME("hpa_dirty_mult"), CTL(opt_hpa_dirty_mult)}, {NAME("hpa_sec_nshards"), CTL(opt_hpa_sec_nshards)}, {NAME("hpa_sec_max_alloc"), CTL(opt_hpa_sec_max_alloc)}, @@ -2137,7 +2143,11 @@ CTL_RO_NL_GEN( opt_hpa_min_purge_interval_ms, opt_hpa_opts.min_purge_interval_ms, uint64_t) CTL_RO_NL_GEN(opt_experimental_hpa_max_purge_nhp, opt_hpa_opts.experimental_max_purge_nhp, ssize_t) - +CTL_RO_NL_GEN(opt_hpa_purge_threshold, opt_hpa_opts.purge_threshold, size_t) +CTL_RO_NL_GEN( + opt_hpa_min_purge_delay_ms, opt_hpa_opts.min_purge_delay_ms, uint64_t) +CTL_RO_NL_GEN(opt_hpa_hugify_style, + hpa_hugify_style_names[opt_hpa_opts.hugify_style], const char *) /* * This will have to change before we publicly document this option; fxp_t and * its representation are internal implementation details. 
diff --git a/src/hpa.c b/src/hpa.c index 271b1af4..27db53a9 100644 --- a/src/hpa.c +++ b/src/hpa.c @@ -26,6 +26,8 @@ static void hpa_dalloc_batch(tsdn_t *tsdn, pai_t *self, edata_list_active_t *list, bool *deferred_work_generated); static uint64_t hpa_time_until_deferred_work(tsdn_t *tsdn, pai_t *self); +const char *const hpa_hugify_style_names[] = {"auto", "none", "eager", "lazy"}; + bool hpa_hugepage_size_exceeds_limit(void) { return HUGEPAGE > HUGEPAGE_MAX_EXPECTED_SIZE; @@ -97,7 +99,7 @@ hpa_alloc_ps(tsdn_t *tsdn, hpa_central_t *central) { static hpdata_t * hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size, - uint64_t age, bool *oom) { + uint64_t age, bool hugify_eager, bool *oom) { /* Don't yet support big allocations; these should get filtered out. */ assert(size <= HUGEPAGE); /* @@ -120,7 +122,7 @@ hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size, malloc_mutex_unlock(tsdn, &central->grow_mtx); return NULL; } - hpdata_init(ps, central->eden, age); + hpdata_init(ps, central->eden, age, hugify_eager); central->eden = NULL; central->eden_len = 0; malloc_mutex_unlock(tsdn, &central->grow_mtx); @@ -133,22 +135,20 @@ hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size, * allocate an edata_t for the new psset. */ if (central->eden == NULL) { - /* - * During development, we're primarily concerned with systems - * with overcommit. Eventually, we should be more careful here. - */ - bool commit = true; /* Allocate address space, bailing if we fail. 
*/ - void *new_eden = pages_map( - NULL, HPA_EDEN_SIZE, HUGEPAGE, &commit); + void *new_eden = central->hooks.map(HPA_EDEN_SIZE); if (new_eden == NULL) { *oom = true; malloc_mutex_unlock(tsdn, &central->grow_mtx); return NULL; } + if (hugify_eager) { + central->hooks.hugify( + new_eden, HPA_EDEN_SIZE, /* sync */ false); + } ps = hpa_alloc_ps(tsdn, central); if (ps == NULL) { - pages_unmap(new_eden, HPA_EDEN_SIZE); + central->hooks.unmap(new_eden, HPA_EDEN_SIZE); *oom = true; malloc_mutex_unlock(tsdn, &central->grow_mtx); return NULL; @@ -170,7 +170,7 @@ hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size, assert(central->eden_len % HUGEPAGE == 0); assert(HUGEPAGE_ADDR2BASE(central->eden) == central->eden); - hpdata_init(ps, central->eden, age); + hpdata_init(ps, central->eden, age, hugify_eager); char *eden_char = (char *)central->eden; eden_char += HUGEPAGE; @@ -213,6 +213,7 @@ hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap, shard->npending_purge = 0; nstime_init_zero(&shard->last_purge); + nstime_init_zero(&shard->last_time_work_attempted); shard->stats.npurge_passes = 0; shard->stats.npurges = 0; @@ -274,6 +275,34 @@ hpa_shard_stats_merge( malloc_mutex_unlock(tsdn, &shard->grow_mtx); } +static bool +hpa_is_hugify_eager(hpa_shard_t *shard) { + return shard->opts.hugify_style == hpa_hugify_style_eager; +} + +static bool +hpa_is_hugify_lazy(hpa_shard_t *shard) { + /* When hugify_sync==true we also set/unset HG bit manually */ + return shard->opts.hugify_style == hpa_hugify_style_lazy + || shard->opts.hugify_sync; +} + +static bool +hpa_is_hugify_none(hpa_shard_t *shard) { + return shard->opts.hugify_style == hpa_hugify_style_none; +} + +/* + * Experimentation has shown that when we are purging only HUGEPAGE ranges and + * hugifying eagerly (or thp enabled=always) we get huge pages more often. This + * helps us have more realistic accounting. 
+ */ +static bool +hpa_should_assume_huge(hpa_shard_t *shard, const hpdata_t *ps) { + return (hpa_is_hugify_eager(shard) || hpa_is_hugify_none(shard)) + && hpdata_purged_when_empty_and_huge_get(ps); +} + static bool hpa_good_hugification_candidate(hpa_shard_t *shard, hpdata_t *ps) { /* @@ -285,6 +314,20 @@ hpa_good_hugification_candidate(hpa_shard_t *shard, hpdata_t *ps) { >= shard->opts.hugification_threshold; } +static bool +hpa_good_purge_candidate(hpa_shard_t *shard, hpdata_t *ps) { + if (shard->opts.dirty_mult == (fxp_t)-1) { + /* No purging. */ + return false; + } + size_t ndirty = hpdata_ndirty_get(ps); + /* Empty pages are good candidate for purging. */ + if (ndirty > 0 && hpdata_empty(ps)) { + return true; + } + return ndirty * PAGE >= shard->opts.purge_threshold; +} + static size_t hpa_adjusted_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) { malloc_mutex_assert_owner(tsdn, &shard->mtx); @@ -316,6 +359,14 @@ hpa_hugify_blocked_by_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) { static bool hpa_should_purge(tsdn_t *tsdn, hpa_shard_t *shard) { malloc_mutex_assert_owner(tsdn, &shard->mtx); + /* + * The page that is purgable may be delayed, but we just want to know + * if there is a need for bg thread to wake up in the future. 
+ */ + hpdata_t *ps = psset_pick_purge(&shard->psset, NULL); + if (ps == NULL) { + return false; + } if (hpa_adjusted_ndirty(tsdn, shard) > hpa_ndirty_max(tsdn, shard)) { return true; } @@ -325,6 +376,20 @@ hpa_should_purge(tsdn_t *tsdn, hpa_shard_t *shard) { return false; } +static void +hpa_assume_huge(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + + assert(hpa_should_assume_huge(shard, ps)); + if (hpdata_huge_get(ps) || hpdata_empty(ps)) { + return; + } + + if (hpdata_ntouched_get(ps) != HUGEPAGE_PAGES) { + hpdata_hugify(ps); + } +} + static void hpa_update_purge_hugify_eligibility( tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) { @@ -356,13 +421,28 @@ hpa_update_purge_hugify_eligibility( * allocator's end at all; we just try to pack allocations in a * hugepage-friendly manner and let the OS hugify in the background. */ - hpdata_purge_allowed_set(ps, hpdata_ndirty_get(ps) > 0); - if (hpa_good_hugification_candidate(shard, ps) + if (hpa_should_assume_huge(shard, ps)) { + /* Assume it is huge without the need to madvise */ + hpa_assume_huge(tsdn, shard, ps); + } + if (hpa_is_hugify_lazy(shard) + && hpa_good_hugification_candidate(shard, ps) && !hpdata_huge_get(ps)) { nstime_t now; shard->central->hooks.curtime(&now, /* first_reading */ true); hpdata_allow_hugify(ps, now); } + bool purgable = hpa_good_purge_candidate(shard, ps); + if (purgable && !hpdata_purge_allowed_get(ps) + && (shard->opts.min_purge_delay_ms > 0)) { + nstime_t now; + uint64_t delayns = shard->opts.min_purge_delay_ms * 1000 * 1000; + shard->central->hooks.curtime(&now, /* first_reading */ true); + nstime_iadd(&now, delayns); + hpdata_time_purge_allowed_set(ps, &now); + } + hpdata_purge_allowed_set(ps, purgable); + /* * Once a hugepage has become eligible for hugification, we don't mark * it as ineligible just because it stops meeting the criteria (this @@ -375,7 +455,7 @@ hpa_update_purge_hugify_eligibility( * empty; it definitely doesn't 
help there until the hugepage gets * reused, which is likely not for a while. */ - if (hpdata_nactive_get(ps) == 0) { + if (hpdata_nactive_get(ps) == 0 && !hpa_should_assume_huge(shard, ps)) { hpdata_disallow_hugify(ps); } } @@ -394,8 +474,7 @@ hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) { * This value protects two things: * 1. Stack size * 2. Number of huge pages that are being purged in a batch as - * we do not allow allocations while making *madvise - * syscall. + * we do not allow allocations while making madvise syscall. */ #define HPA_PURGE_BATCH_MAX_DEFAULT 16 @@ -433,18 +512,16 @@ hpa_purge_actual_unlocked( hpa_range_accum_init(&accum, vec, len); for (size_t i = 0; i < batch_sz; ++i) { - hpdata_t *to_purge = batch[i].hp; - /* Actually do the purging, now that the lock is dropped. */ if (batch[i].dehugify) { shard->central->hooks.dehugify( - hpdata_addr_get(to_purge), HUGEPAGE); + hpdata_addr_get(batch[i].hp), HUGEPAGE); } void *purge_addr; size_t purge_size; size_t total_purged_on_one_hp = 0; while (hpdata_purge_next( - to_purge, &batch[i].state, &purge_addr, &purge_size)) { + batch[i].hp, &batch[i].state, &purge_addr, &purge_size)) { total_purged_on_one_hp += purge_size; assert(total_purged_on_one_hp <= HUGEPAGE); hpa_range_accum_add( @@ -454,14 +531,23 @@ hpa_purge_actual_unlocked( hpa_range_accum_finish(&accum, shard); } -/* Prepare purge of one page. Return num of dirty regular pages on it +static inline bool +hpa_needs_dehugify(hpa_shard_t *shard, const hpdata_t *ps) { + return hpa_is_hugify_lazy(shard) && hpdata_huge_get(ps) + && !hpdata_empty(ps); +} + +/* Prepare purge of one page. 
Return number of dirty regular pages on it * Return 0 if no purgable huge page is found * * If there was a page to purge its purge state is initialized */ static inline size_t -hpa_purge_start_hp(hpa_purge_batch_t *b, psset_t *psset) { - hpdata_t *to_purge = psset_pick_purge(psset); +hpa_purge_start_hp(hpa_purge_batch_t *b, hpa_shard_t *shard) { + psset_t *psset = &shard->psset; + hpdata_t *to_purge = (shard->opts.min_purge_delay_ms > 0) + ? psset_pick_purge(psset, &shard->last_time_work_attempted) + : psset_pick_purge(psset, NULL); if (to_purge == NULL) { return 0; } @@ -493,7 +579,9 @@ hpa_purge_start_hp(hpa_purge_batch_t *b, psset_t *psset) { b->item_cnt++; hp_item->hp = to_purge; /* Gather all the metadata we'll need during the purge. */ - hp_item->dehugify = hpdata_huge_get(hp_item->hp); + hp_item->dehugify = hpa_needs_dehugify(shard, hp_item->hp); + hpdata_purged_when_empty_and_huge_set(hp_item->hp, + hpdata_huge_get(hp_item->hp) && hpdata_empty(hp_item->hp)); size_t nranges; size_t ndirty = hpdata_purge_begin( hp_item->hp, &hp_item->state, &nranges); @@ -513,7 +601,11 @@ hpa_purge_finish_hp( } /* The hpdata updates. */ psset_update_begin(&shard->psset, hp_item->hp); - if (hp_item->dehugify) { + if (hpdata_huge_get(hp_item->hp)) { + /* + * Even when dehugify is not explicitly called, the page is + * assumed to be non-huge after purge. 
+ */ hpdata_dehugify(hp_item->hp); } hpdata_purge_end(hp_item->hp, &hp_item->state); @@ -569,8 +661,7 @@ hpa_purge(tsdn_t *tsdn, hpa_shard_t *shard, size_t max_hp) { assert(hpa_batch_empty(&batch)); while ( !hpa_batch_full(&batch) && hpa_should_purge(tsdn, shard)) { - size_t ndirty = hpa_purge_start_hp( - &batch, &shard->psset); + size_t ndirty = hpa_purge_start_hp(&batch, shard); if (ndirty == 0) { break; } @@ -633,25 +724,33 @@ hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) { hpdata_disallow_hugify(to_hugify); assert(hpdata_alloc_allowed_get(to_hugify)); psset_update_end(&shard->psset, to_hugify); - - malloc_mutex_unlock(tsdn, &shard->mtx); - - bool err = shard->central->hooks.hugify( - hpdata_addr_get(to_hugify), HUGEPAGE, shard->opts.hugify_sync); - - malloc_mutex_lock(tsdn, &shard->mtx); - shard->stats.nhugifies++; - if (err) { - /* - * When asynchronous hugification is used - * (shard->opts.hugify_sync option is false), we are not - * expecting to get here, unless something went terrible wrong. - * Because underlying syscall is only setting kernel flag for - * memory range (actual hugification happens asynchronously - * and we are not getting any feedback about its outcome), we - * expect syscall to be successful all the time. - */ - shard->stats.nhugify_failures++; + /* + * Without lazy hugification, user relies on eagerly setting HG bit, or + * leaving everything up to the kernel (ex: thp enabled=always). We + * will still pretend that call succeeds to keep our accounting close to + * what user believes is the truth on the target system, but we won't + * update nhugifies stat as system call is not being made. 
+ */ + if (hpa_is_hugify_lazy(shard)) { + malloc_mutex_unlock(tsdn, &shard->mtx); + bool err = shard->central->hooks.hugify( + hpdata_addr_get(to_hugify), HUGEPAGE, + shard->opts.hugify_sync); + malloc_mutex_lock(tsdn, &shard->mtx); + shard->stats.nhugifies++; + if (err) { + /* + * When asynchronous hugification is used + * (shard->opts.hugify_sync option is false), we are not + * expecting to get here, unless something went terrible + * wrong. Because underlying syscall is only setting + * kernel flag for memory range (actual hugification + * happens asynchronously and we are not getting any + * feedback about its outcome), we expect syscall to be + * successful all the time. + */ + shard->stats.nhugify_failures++; + } } psset_update_begin(&shard->psset, to_hugify); @@ -666,11 +765,18 @@ hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) { static bool hpa_min_purge_interval_passed(tsdn_t *tsdn, hpa_shard_t *shard) { malloc_mutex_assert_owner(tsdn, &shard->mtx); - uint64_t since_last_purge_ms = shard->central->hooks.ms_since( - &shard->last_purge); + uint64_t since_last_purge_ms = nstime_ms_between( + &shard->last_purge, &shard->last_time_work_attempted); return since_last_purge_ms >= shard->opts.min_purge_interval_ms; } +static inline void +hpa_update_time_work_attempted(tsdn_t *tsdn, hpa_shard_t *shard) { + malloc_mutex_assert_owner(tsdn, &shard->mtx); + shard->central->hooks.curtime(&shard->last_time_work_attempted, + /* first_reading */ false); +} + /* * Execution of deferred work is forced if it's triggered by an explicit * hpa_shard_do_deferred_work() call. 
@@ -682,6 +788,7 @@ hpa_shard_maybe_do_deferred_work( if (!forced && shard->opts.deferral_allowed) { return; } + hpa_update_time_work_attempted(tsdn, shard); /* * If we're on a background thread, do work so long as there's work to @@ -753,8 +860,8 @@ hpa_try_alloc_one_no_grow( * If the pageslab used to be empty, treat it as though it's * brand new for fragmentation-avoidance purposes; what we're * trying to approximate is the age of the allocations *in* that - * pageslab, and the allocations in the new pageslab are - * definitionally the youngest in this hpa shard. + * pageslab, and the allocations in the new pageslab are by + * definition the youngest in this hpa shard. */ hpdata_age_set(ps, shard->age_counter++); } @@ -861,8 +968,8 @@ hpa_alloc_batch_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, * deallocations (and allocations of smaller sizes) may still succeed * while we're doing this potentially expensive system call. */ - hpdata_t *ps = hpa_central_extract( - tsdn, shard->central, size, shard->age_counter++, &oom); + hpdata_t *ps = hpa_central_extract(tsdn, shard->central, size, + shard->age_counter++, hpa_is_hugify_eager(shard), &oom); if (ps == NULL) { malloc_mutex_unlock(tsdn, &shard->grow_mtx); return nsuccess; diff --git a/src/hpa_hooks.c b/src/hpa_hooks.c index 14005ae0..2ec7029d 100644 --- a/src/hpa_hooks.c +++ b/src/hpa_hooks.c @@ -19,7 +19,13 @@ const hpa_hooks_t hpa_hooks_default = {&hpa_hooks_map, &hpa_hooks_unmap, static void * hpa_hooks_map(size_t size) { + /* + * During development, we're primarily concerned with systems + * that overcommit. Eventually, we should be more careful here. 
+ */ + bool commit = true; + assert((size & HUGEPAGE_MASK) == 0); void *ret = pages_map(NULL, size, HUGEPAGE, &commit); JE_USDT(hpa_map, 2, size, ret); return ret; diff --git a/src/hpdata.c b/src/hpdata.c index f9c8f4fa..e17d9ecf 100644 --- a/src/hpdata.c +++ b/src/hpdata.c @@ -17,11 +17,10 @@ hpdata_age_comp(const hpdata_t *a, const hpdata_t *b) { ph_gen(, hpdata_age_heap, hpdata_t, age_link, hpdata_age_comp) -void -hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age) { + void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age, bool is_huge) { hpdata_addr_set(hpdata, addr); hpdata_age_set(hpdata, age); - hpdata->h_huge = false; + hpdata->h_huge = is_huge; hpdata->h_alloc_allowed = true; hpdata->h_in_psset_alloc_container = false; hpdata->h_purge_allowed = false; @@ -34,8 +33,16 @@ hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age) { hpdata_longest_free_range_set(hpdata, HUGEPAGE_PAGES); hpdata->h_nactive = 0; fb_init(hpdata->active_pages, HUGEPAGE_PAGES); - hpdata->h_ntouched = 0; - fb_init(hpdata->touched_pages, HUGEPAGE_PAGES); + if (is_huge) { + fb_set_range( + hpdata->touched_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES); + hpdata->h_ntouched = HUGEPAGE_PAGES; + } else { + fb_init(hpdata->touched_pages, HUGEPAGE_PAGES); + hpdata->h_ntouched = 0; + } + nstime_init_zero(&hpdata->h_time_purge_allowed); + hpdata->h_purged_when_empty_and_huge = false; hpdata_assert_consistent(hpdata); } diff --git a/src/jemalloc.c b/src/jemalloc.c index a3f01b3c..72216508 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1619,6 +1619,50 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], opt_hpa_opts.experimental_max_purge_nhp, "experimental_hpa_max_purge_nhp", -1, SSIZE_MAX); + /* + * Accept either a ratio-based or an exact purge + * threshold. 
+ */ + CONF_HANDLE_SIZE_T(opt_hpa_opts.purge_threshold, + "hpa_purge_threshold", PAGE, HUGEPAGE, + CONF_CHECK_MIN, CONF_CHECK_MAX, true); + if (CONF_MATCH("hpa_purge_threshold_ratio")) { + fxp_t ratio; + char *end; + bool err = fxp_parse(&ratio, v, &end); + if (err || (size_t)(end - v) != vlen + || ratio > FXP_INIT_INT(1)) { + CONF_ERROR("Invalid conf value", k, + klen, v, vlen); + } else { + opt_hpa_opts.purge_threshold = + fxp_mul_frac(HUGEPAGE, ratio); + } + CONF_CONTINUE; + } + + CONF_HANDLE_UINT64_T(opt_hpa_opts.min_purge_delay_ms, + "hpa_min_purge_delay_ms", 0, UINT64_MAX, + CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, false); + + if (strncmp("hpa_hugify_style", k, klen) == 0) { + bool match = false; + for (int m = 0; m < hpa_hugify_style_limit; m++) { + if (strncmp(hpa_hugify_style_names[m], + v, vlen) + == 0) { + opt_hpa_opts.hugify_style = m; + match = true; + break; + } + } + if (!match) { + CONF_ERROR("Invalid conf value", k, + klen, v, vlen); + } + CONF_CONTINUE; + } + if (CONF_MATCH("hpa_dirty_mult")) { if (CONF_MATCH_VALUE("-1")) { opt_hpa_opts.dirty_mult = (fxp_t)-1; diff --git a/src/nstime.c b/src/nstime.c index ee2ddc51..0dfbeda1 100644 --- a/src/nstime.c +++ b/src/nstime.c @@ -160,6 +160,19 @@ nstime_divide(const nstime_t *time, const nstime_t *divisor) { return time->ns / divisor->ns; } +uint64_t +nstime_ns_between(const nstime_t *earlier, const nstime_t *later) { + nstime_assert_initialized(earlier); + nstime_assert_initialized(later); + assert(nstime_compare(later, earlier) >= 0); + return later->ns - earlier->ns; +} + +uint64_t +nstime_ms_between(const nstime_t *earlier, const nstime_t *later) { + return nstime_ns_between(earlier, later) / MILLION; +} + /* Returns time since *past in nanoseconds, w/o updating *past. 
*/ uint64_t nstime_ns_since(const nstime_t *past) { @@ -168,9 +181,7 @@ nstime_ns_since(const nstime_t *past) { nstime_t now; nstime_copy(&now, past); nstime_update(&now); - - assert(nstime_compare(&now, past) >= 0); - return now.ns - past->ns; + return nstime_ns_between(past, &now); } /* Returns time since *past in milliseconds, w/o updating *past. */ diff --git a/src/pages.c b/src/pages.c index bc1093a3..000b87fe 100644 --- a/src/pages.c +++ b/src/pages.c @@ -833,9 +833,19 @@ init_thp_state(void) { } else { goto label_error; } + if (opt_hpa_opts.hugify_style == hpa_hugify_style_auto) { + if (init_system_thp_mode == thp_mode_default) { + opt_hpa_opts.hugify_style = hpa_hugify_style_lazy; + } else { + opt_hpa_opts.hugify_style = hpa_hugify_style_none; + } + } return; #elif defined(JEMALLOC_HAVE_MEMCNTL) init_system_thp_mode = thp_mode_default; + if (opt_hpa_opts.hugify_style == hpa_hugify_style_auto) { + opt_hpa_opts.hugify_style = hpa_hugify_style_eager; + } return; #endif label_error: diff --git a/src/psset.c b/src/psset.c index 509df064..a8a9615d 100644 --- a/src/psset.c +++ b/src/psset.c @@ -390,17 +390,42 @@ psset_pick_alloc(psset_t *psset, size_t size) { } hpdata_t * -psset_pick_purge(psset_t *psset) { - ssize_t ind_ssz = fb_fls( - psset->purge_bitmap, PSSET_NPURGE_LISTS, PSSET_NPURGE_LISTS - 1); - if (ind_ssz < 0) { - return NULL; +psset_pick_purge(psset_t *psset, const nstime_t *now) { + size_t max_bit = PSSET_NPURGE_LISTS - 1; + while (1) { + ssize_t ind_ssz = fb_fls( + psset->purge_bitmap, PSSET_NPURGE_LISTS, max_bit); + if (ind_ssz < 0) { + break; + } + pszind_t ind = (pszind_t)ind_ssz; + assert(ind < PSSET_NPURGE_LISTS); + hpdata_t *ps = hpdata_purge_list_first(&psset->to_purge[ind]); + assert(ps != NULL); + if (now == NULL) { + return ps; + } + /* + * We only check the first page (it had least recent hpa_alloc + * or hpa_dalloc). It is possible that some page in the list + * would meet the time, but we only guarantee the min delay. 
If + * we want to get the one that changed the state to purgable + * the earliest, we would change the list into a heap ordered by + * time. We will use benchmark to make a decision. + */ + const nstime_t *tm_allowed = hpdata_time_purge_allowed_get(ps); + if (nstime_compare(tm_allowed, now) <= 0) { + return ps; + } + if (ind == 0) { + /* Lowest list checked; don't wrap max_bit. */ + break; + } + /* Continue strictly below the list just rejected. */ + max_bit = ind - 1; } - pszind_t ind = (pszind_t)ind_ssz; - assert(ind < PSSET_NPURGE_LISTS); - hpdata_t *ps = hpdata_purge_list_first(&psset->to_purge[ind]); - assert(ps != NULL); - return ps; + /* No page is ready yet */ + return NULL; } hpdata_t * diff --git a/src/stats.c b/src/stats.c index a8a574ac..ea7a4e2e 100644 --- a/src/stats.c +++ b/src/stats.c @@ -1618,6 +1618,9 @@ stats_general_print(emitter_t *emitter) { "opt.hpa_dirty_mult", emitter_type_string, &bufp); } } + OPT_WRITE_SIZE_T("hpa_purge_threshold") + OPT_WRITE_UINT64("hpa_min_purge_delay_ms") + OPT_WRITE_CHAR_P("hpa_hugify_style") OPT_WRITE_SIZE_T("hpa_sec_nshards") OPT_WRITE_SIZE_T("hpa_sec_max_alloc") OPT_WRITE_SIZE_T("hpa_sec_max_bytes") diff --git a/test/unit/hpa.c b/test/unit/hpa.c index 1fed8a80..df2c9d96 100644 --- a/test/unit/hpa.c +++ b/test/unit/hpa.c @@ -37,7 +37,13 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = { /* min_purge_interval_ms */ 5 * 1000, /* experimental_max_purge_nhp */ - -1}; + -1, + /* purge_threshold */ + 1, + /* min_purge_delay_ms */ + 0, + /* hugify_style */ + hpa_hugify_style_lazy}; static hpa_shard_opts_t test_hpa_shard_opts_purge = { /* slab_max_alloc */ @@ -55,7 +61,37 @@ static hpa_shard_opts_t test_hpa_shard_opts_purge = { /* min_purge_interval_ms */ 5 * 1000, /* experimental_max_purge_nhp */ - -1}; + -1, + /* purge_threshold */ + 1, + /* min_purge_delay_ms */ + 0, + /* hugify_style */ + hpa_hugify_style_lazy}; + +static hpa_shard_opts_t test_hpa_shard_opts_aggressive = { + /* slab_max_alloc */ + HUGEPAGE, + /* 
hugification_threshold */ + 0.9 * HUGEPAGE, + /* dirty_mult */ + FXP_INIT_PERCENT(11), + /* deferral_allowed */ + true, + /* hugify_delay_ms 
*/ + 0, + /* hugify_sync */ + false, + /* min_purge_interval_ms */ + 5, + /* experimental_max_purge_nhp */ + -1, + /* purge_threshold */ + HUGEPAGE - 5 * PAGE, + /* min_purge_delay_ms */ + 10, + /* hugify_style */ + hpa_hugify_style_eager}; static hpa_shard_t * create_test_data(const hpa_hooks_t *hooks, hpa_shard_opts_t *opts) { @@ -365,10 +401,11 @@ defer_test_unmap(void *ptr, size_t size) { } static size_t ndefer_purge_calls = 0; +static size_t npurge_size = 0; static void defer_test_purge(void *ptr, size_t size) { (void)ptr; - (void)size; + npurge_size = size; ++ndefer_purge_calls; } @@ -783,6 +820,625 @@ TEST_BEGIN(test_vectorized_opt_eq_zero) { } TEST_END +TEST_BEGIN(test_starts_huge) { + test_skip_if(!hpa_supported() || (opt_process_madvise_max_batch != 0) + || !config_stats); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + hooks.vectorized_purge = &defer_vectorized_purge; + + hpa_shard_opts_t opts = test_hpa_shard_opts_aggressive; + opts.deferral_allowed = true; + opts.min_purge_delay_ms = 10; + opts.min_purge_interval_ms = 0; + + defer_vectorized_purge_called = false; + ndefer_purge_calls = 0; + + hpa_shard_t *shard = create_test_data(&hooks, &opts); + bool deferred_work_generated = false; + nstime_init2(&defer_curtime, 100, 0); + + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + enum { NALLOCS = 2 * HUGEPAGE_PAGES }; + edata_t *edatas[NALLOCS]; + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + /* Deallocate 75% */ + int pages_to_deallocate = (int)(0.75 * NALLOCS); + for (int i = 0; i < pages_to_deallocate; i++) { + pai_dalloc( + tsdn, &shard->pai, edatas[i], 
&deferred_work_generated); + } + + /* + * While there is enough to purge as we have one empty page and that + * one meets the threshold, we need to respect the delay, so no purging + * should happen yet. + */ + hpa_shard_do_deferred_work(tsdn, shard); + expect_zu_eq(0, ndefer_purge_calls, "Purged too early, delay==10ms"); + + nstime_iadd(&defer_curtime, opts.min_purge_delay_ms * 1000 * 1000); + /* Now, enough time has passed, so we expect to purge */ + hpa_shard_do_deferred_work(tsdn, shard); + expect_zu_eq(1, ndefer_purge_calls, "Expected purge"); + + /* + * We purged one hugepage, so we expect to have one non-full page and it + * should have half of the other dirty. + */ + psset_stats_t *stat = &shard->psset.stats; + expect_zu_eq( + stat->empty_slabs[1].npageslabs, 0, "Expected zero huge slabs"); + expect_zu_eq(stat->empty_slabs[0].npageslabs, 1, "Expected 1 nh slab"); + expect_zu_eq(stat->full_slabs[0].npageslabs, 0, ""); + expect_zu_eq(stat->full_slabs[1].npageslabs, 0, ""); + expect_zu_eq( + stat->merged.ndirty, HUGEPAGE_PAGES / 2, "One HP half dirty"); + + /* + * We now allocate one more PAGE than a half the hugepage because we + * want to make sure that one more hugepage is needed. 
+ */ + deferred_work_generated = false; + const size_t HALF = HUGEPAGE_PAGES / 2; + edatas[1] = pai_alloc(tsdn, &shard->pai, PAGE * (HALF + 1), PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[1], "Unexpected null edata"); + expect_false(deferred_work_generated, "No page is purgable"); + + expect_zu_eq(stat->empty_slabs[1].npageslabs, 0, ""); + expect_zu_eq(stat->empty_slabs[0].npageslabs, 0, ""); + expect_zu_eq(stat->full_slabs[0].npageslabs, 0, ""); + expect_zu_eq(stat->full_slabs[1].npageslabs, 0, ""); + + /* + * We expect that all inactive bytes on the second page are counted as + * dirty (this is because the page was huge and empty when we purged + * it, thus, it is assumed to come back as huge, thus all the bytes are + * counted as touched). + */ + expect_zu_eq(stat->merged.ndirty, 2 * HALF - 1, + "2nd page is huge because it was empty and huge when purged"); + expect_zu_eq(stat->merged.nactive, HALF + (HALF + 1), "1st + 2nd"); + + nstime_iadd(&defer_curtime, opts.min_purge_delay_ms * 1000 * 1000); + pai_dalloc(tsdn, &shard->pai, edatas[1], &deferred_work_generated); + expect_true(deferred_work_generated, ""); + expect_zu_eq(stat->merged.ndirty, 3 * HALF, "1st + 2nd"); + + /* + * Deallocate last allocation and confirm that page is empty again, and + * once new minimum delay is reached, page should be purged. + */ + ndefer_purge_calls = 0; + nstime_iadd(&defer_curtime, opts.min_purge_delay_ms * 1000 * 1000); + hpa_shard_do_deferred_work(tsdn, shard); + expect_zu_eq(1, ndefer_purge_calls, ""); + expect_zu_eq(stat->merged.ndirty, HALF, "2nd cleared as it was empty"); + ndefer_purge_calls = 0; + + /* Deallocate all the rest, but leave only two active */ + for (int i = pages_to_deallocate; i < NALLOCS - 2; ++i) { + pai_dalloc( + tsdn, &shard->pai, edatas[i], &deferred_work_generated); + } + + /* + * With prior pai_dalloc our last page becomes purgable, however we + * still want to respect the delay. 
Thus, it is not time to purge yet. + */ + hpa_shard_do_deferred_work(tsdn, shard); + expect_true(deferred_work_generated, "Above limit, but not time yet"); + expect_zu_eq(0, ndefer_purge_calls, ""); + + /* + * Finally, we move the time ahead, and we confirm that purge happens + * and that we have exactly two active base pages and none dirty. + */ + nstime_iadd(&defer_curtime, opts.min_purge_delay_ms * 1000 * 1000); + hpa_shard_do_deferred_work(tsdn, shard); + expect_true(deferred_work_generated, "Above limit, but not time yet"); + expect_zu_eq(1, ndefer_purge_calls, ""); + expect_zu_eq(stat->merged.ndirty, 0, "Purged all"); + expect_zu_eq(stat->merged.nactive, 2, "1st only"); + + ndefer_purge_calls = 0; + destroy_test_data(shard); +} +TEST_END + +TEST_BEGIN(test_start_huge_purge_empty_only) { + test_skip_if(!hpa_supported() || (opt_process_madvise_max_batch != 0) + || !config_stats); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + hooks.vectorized_purge = &defer_vectorized_purge; + + hpa_shard_opts_t opts = test_hpa_shard_opts_aggressive; + opts.deferral_allowed = true; + opts.purge_threshold = HUGEPAGE; + opts.min_purge_delay_ms = 0; + opts.hugify_style = hpa_hugify_style_eager; + opts.min_purge_interval_ms = 0; + + ndefer_purge_calls = 0; + npurge_size = 0; + hpa_shard_t *shard = create_test_data(&hooks, &opts); + bool deferred_work_generated = false; + nstime_init(&defer_curtime, 10 * 1000 * 1000); + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + enum { NALLOCS = 2 * HUGEPAGE_PAGES }; + edata_t *edatas[NALLOCS]; + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + /* Deallocate all 
from the first and one PAGE from the second HP. */ + for (int i = 0; i < NALLOCS / 2 + 1; i++) { + pai_dalloc( + tsdn, &shard->pai, edatas[i], &deferred_work_generated); + } + hpa_shard_do_deferred_work(tsdn, shard); + expect_true(deferred_work_generated, ""); + expect_zu_eq(1, ndefer_purge_calls, "Should purge, delay==0ms"); + expect_zu_eq(HUGEPAGE, npurge_size, "Purge whole folio"); + expect_zu_eq(shard->psset.stats.merged.ndirty, 1, ""); + expect_zu_eq(shard->psset.stats.merged.nactive, HUGEPAGE_PAGES - 1, ""); + + ndefer_purge_calls = 0; + npurge_size = 0; + hpa_shard_do_deferred_work(tsdn, shard); + expect_zu_eq(0, ndefer_purge_calls, "Should not purge anything"); + + /* Allocate and free 2*PAGE so that it spills into second page again */ + edatas[0] = pai_alloc(tsdn, &shard->pai, 2 * PAGE, PAGE, false, false, + false, &deferred_work_generated); + pai_dalloc(tsdn, &shard->pai, edatas[0], &deferred_work_generated); + expect_true(deferred_work_generated, ""); + hpa_shard_do_deferred_work(tsdn, shard); + expect_zu_eq(1, ndefer_purge_calls, "Should purge, delay==0ms"); + expect_zu_eq(HUGEPAGE, npurge_size, "Purge whole folio"); + + ndefer_purge_calls = 0; + destroy_test_data(shard); +} +TEST_END + +TEST_BEGIN(test_assume_huge_purge_fully) { + test_skip_if(!hpa_supported() || (opt_process_madvise_max_batch != 0) + || !config_stats); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + hooks.vectorized_purge = &defer_vectorized_purge; + + hpa_shard_opts_t opts = test_hpa_shard_opts_aggressive; + opts.deferral_allowed = true; + opts.purge_threshold = PAGE; + opts.hugification_threshold = HUGEPAGE; + opts.min_purge_delay_ms = 0; + opts.min_purge_interval_ms = 0; + opts.hugify_style = hpa_hugify_style_eager; + opts.dirty_mult = 
FXP_INIT_PERCENT(1); + + ndefer_purge_calls = 0; + hpa_shard_t *shard = create_test_data(&hooks, &opts); + bool deferred_work_generated = false; + nstime_init(&defer_curtime, 10 * 1000 * 1000); + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + enum { NALLOCS = HUGEPAGE_PAGES }; + edata_t *edatas[NALLOCS]; + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + /* Deallocate all */ + for (int i = 0; i < NALLOCS; i++) { + pai_dalloc( + tsdn, &shard->pai, edatas[i], &deferred_work_generated); + } + hpa_shard_do_deferred_work(tsdn, shard); + expect_true(deferred_work_generated, ""); + expect_zu_eq(1, ndefer_purge_calls, "Should purge, delay==0ms"); + + /* Stats should say no active */ + expect_zu_eq(shard->psset.stats.merged.nactive, 0, ""); + expect_zu_eq( + shard->psset.stats.empty_slabs[0].npageslabs, 1, "Non huge"); + npurge_size = 0; + edatas[0] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, false, + false, &deferred_work_generated); + expect_ptr_not_null(edatas[0], "Unexpected null edata"); + expect_zu_eq(shard->psset.stats.merged.nactive, 1, ""); + expect_zu_eq(shard->psset.stats.slabs[1].npageslabs, 1, "Huge nonfull"); + pai_dalloc(tsdn, &shard->pai, edatas[0], &deferred_work_generated); + expect_true(deferred_work_generated, ""); + ndefer_purge_calls = 0; + npurge_size = 0; + hpa_shard_do_deferred_work(tsdn, shard); + expect_zu_eq(1, ndefer_purge_calls, "Should purge, delay==0ms"); + expect_zu_eq(HUGEPAGE, npurge_size, "Should purge full folio"); + + /* Now allocate all, free 10%, alloc 5%, assert non-huge */ + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + int ten_pct = NALLOCS / 10; + for (int i = 0; i < ten_pct; i++) { + pai_dalloc( + tsdn, &shard->pai, 
edatas[i], &deferred_work_generated); + } + ndefer_purge_calls = 0; + npurge_size = 0; + hpa_shard_do_deferred_work(tsdn, shard); + expect_zu_eq(1, ndefer_purge_calls, "Should purge, delay==0ms"); + expect_zu_eq( + ten_pct * PAGE, npurge_size, "Should purge 10 percent of pages"); + + for (int i = 0; i < ten_pct / 2; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + expect_zu_eq( + shard->psset.stats.slabs[0].npageslabs, 1, "Nonhuge nonfull"); + expect_zu_eq(shard->psset.stats.merged.ndirty, 0, "No dirty"); + + npurge_size = 0; + ndefer_purge_calls = 0; + destroy_test_data(shard); +} +TEST_END + +TEST_BEGIN(test_eager_with_purge_threshold) { + test_skip_if(!hpa_supported() || (opt_process_madvise_max_batch != 0)); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + hooks.vectorized_purge = &defer_vectorized_purge; + + const size_t THRESHOLD = 10; + hpa_shard_opts_t opts = test_hpa_shard_opts_aggressive; + opts.deferral_allowed = true; + opts.purge_threshold = THRESHOLD * PAGE; + opts.min_purge_delay_ms = 0; + opts.hugify_style = hpa_hugify_style_eager; + opts.dirty_mult = FXP_INIT_PERCENT(0); + + ndefer_purge_calls = 0; + hpa_shard_t *shard = create_test_data(&hooks, &opts); + bool deferred_work_generated = false; + nstime_init(&defer_curtime, 10 * 1000 * 1000); + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + enum { NALLOCS = HUGEPAGE_PAGES }; + edata_t *edatas[NALLOCS]; + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + /* Deallocate fewer than threshold PAGEs.
*/ + for (size_t i = 0; i < THRESHOLD - 1; i++) { + pai_dalloc( + tsdn, &shard->pai, edatas[i], &deferred_work_generated); + } + hpa_shard_do_deferred_work(tsdn, shard); + expect_false(deferred_work_generated, "No page is purgable"); + expect_zu_eq(0, ndefer_purge_calls, "Should not purge yet"); + /* Deallocate one more page to meet the threshold */ + pai_dalloc( + tsdn, &shard->pai, edatas[THRESHOLD - 1], &deferred_work_generated); + hpa_shard_do_deferred_work(tsdn, shard); + expect_zu_eq(1, ndefer_purge_calls, "Should purge"); + expect_zu_eq(shard->psset.stats.merged.ndirty, 0, ""); + + ndefer_purge_calls = 0; + destroy_test_data(shard); +} +TEST_END + +TEST_BEGIN(test_delay_when_not_allowed_deferral) { + test_skip_if(!hpa_supported() || (opt_process_madvise_max_batch != 0)); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + hooks.vectorized_purge = &defer_vectorized_purge; + + const uint64_t DELAY_NS = 100 * 1000 * 1000; + hpa_shard_opts_t opts = test_hpa_shard_opts_aggressive; + opts.deferral_allowed = false; + opts.purge_threshold = HUGEPAGE - 2 * PAGE; + opts.min_purge_delay_ms = DELAY_NS / (1000 * 1000); + opts.hugify_style = hpa_hugify_style_lazy; + opts.min_purge_interval_ms = 0; + + hpa_shard_t *shard = create_test_data(&hooks, &opts); + bool deferred_work_generated = false; + nstime_init2(&defer_curtime, 100, 0); + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + enum { NALLOCS = HUGEPAGE_PAGES }; + edata_t *edatas[NALLOCS]; + ndefer_purge_calls = 0; + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + /* Deallocate all */ + for (int i = 0; i < NALLOCS; i++) { + pai_dalloc( + tsdn, 
&shard->pai, edatas[i], &deferred_work_generated); + } + /* curtime = 100.0s */ + hpa_shard_do_deferred_work(tsdn, shard); + expect_true(deferred_work_generated, ""); + expect_zu_eq(0, ndefer_purge_calls, "Too early"); + + nstime_iadd(&defer_curtime, DELAY_NS - 1); + /* This activity will take the curtime=100.1 and reset purgability */ + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + /* Dealloc all but 2 pages, purgable delay_ns later*/ + for (int i = 0; i < NALLOCS - 2; i++) { + pai_dalloc( + tsdn, &shard->pai, edatas[i], &deferred_work_generated); + } + + nstime_iadd(&defer_curtime, DELAY_NS); + pai_dalloc( + tsdn, &shard->pai, edatas[NALLOCS - 1], &deferred_work_generated); + expect_true(ndefer_purge_calls > 0, "Should have purged"); + + ndefer_purge_calls = 0; + destroy_test_data(shard); +} +TEST_END + +TEST_BEGIN(test_deferred_until_time) { + test_skip_if(!hpa_supported() || (opt_process_madvise_max_batch != 0)); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + hooks.vectorized_purge = &defer_vectorized_purge; + + hpa_shard_opts_t opts = test_hpa_shard_opts_aggressive; + opts.deferral_allowed = true; + opts.purge_threshold = PAGE; + opts.min_purge_delay_ms = 1000; + opts.hugification_threshold = HUGEPAGE / 2; + opts.dirty_mult = FXP_INIT_PERCENT(10); + opts.hugify_style = hpa_hugify_style_none; + opts.min_purge_interval_ms = 500; + opts.hugify_delay_ms = 3000; + + hpa_shard_t *shard = create_test_data(&hooks, &opts); + bool deferred_work_generated = false; + /* Current time = 10ms */ + nstime_init(&defer_curtime, 10 * 1000 * 1000); + + /* Allocate one huge page */ + tsdn_t 
*tsdn = tsd_tsdn(tsd_fetch()); + enum { NALLOCS = HUGEPAGE_PAGES }; + edata_t *edatas[NALLOCS]; + ndefer_purge_calls = 0; + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + /* Deallocate 25% */ + for (int i = 0; i < NALLOCS / 4; i++) { + pai_dalloc( + tsdn, &shard->pai, edatas[i], &deferred_work_generated); + } + expect_true(deferred_work_generated, "We should hugify and purge"); + + /* Current time = 300ms, purge_eligible at 300ms + 1000ms */ + nstime_init(&defer_curtime, 300UL * 1000 * 1000); + for (int i = NALLOCS / 4; i < NALLOCS; i++) { + pai_dalloc( + tsdn, &shard->pai, edatas[i], &deferred_work_generated); + } + expect_true(deferred_work_generated, "Purge work generated"); + hpa_shard_do_deferred_work(tsdn, shard); + expect_zu_eq(0, ndefer_purge_calls, "not time for purging yet"); + + /* Current time = 900ms, purge_eligible at 1300ms */ + nstime_init(&defer_curtime, 900UL * 1000 * 1000); + uint64_t until_ns = pai_time_until_deferred_work(tsdn, &shard->pai); + expect_u64_eq(until_ns, BACKGROUND_THREAD_DEFERRED_MIN, + "First pass did not happen"); + + /* + * Fake that the first pass happened more than min_purge_interval_ms + * ago (last_purge = 350ms, now = 900ms, interval = 500ms). + */ + nstime_init(&shard->last_purge, 350UL * 1000 * 1000); + shard->stats.npurge_passes = 1; + until_ns = pai_time_until_deferred_work(tsdn, &shard->pai); + expect_u64_eq(until_ns, BACKGROUND_THREAD_DEFERRED_MIN, + "No need to check anything, it is more than interval"); + + nstime_init(&shard->last_purge, 900UL * 1000 * 1000); + nstime_init(&defer_curtime, 1000UL * 1000 * 1000); + /* Next purge expected at 900ms + min_purge_interval = 1400ms */ + uint64_t expected_ms = 1400 - 1000; + until_ns = pai_time_until_deferred_work(tsdn, &shard->pai); + expect_u64_eq(expected_ms, until_ns / (1000 * 1000), "Next in 400ms"); + destroy_test_data(shard); +} +TEST_END +
+TEST_BEGIN(test_eager_no_hugify_on_threshold) { + test_skip_if(!hpa_supported() || (opt_process_madvise_max_batch != 0) + || !config_stats); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + hooks.vectorized_purge = &defer_vectorized_purge; + + hpa_shard_opts_t opts = test_hpa_shard_opts_aggressive; + opts.deferral_allowed = true; + opts.purge_threshold = PAGE; + opts.min_purge_delay_ms = 0; + opts.hugification_threshold = HUGEPAGE * 0.9; + opts.dirty_mult = FXP_INIT_PERCENT(10); + opts.hugify_style = hpa_hugify_style_eager; + opts.min_purge_interval_ms = 0; + opts.hugify_delay_ms = 0; + + hpa_shard_t *shard = create_test_data(&hooks, &opts); + bool deferred_work_generated = false; + /* Current time = 10ms */ + nstime_init(&defer_curtime, 10 * 1000 * 1000); + + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + /* First allocation makes the page huge */ + enum { NALLOCS = HUGEPAGE_PAGES }; + edata_t *edatas[NALLOCS]; + ndefer_purge_calls = 0; + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + ndefer_hugify_calls = 0; + hpa_shard_do_deferred_work(tsdn, shard); + expect_zu_eq(ndefer_hugify_calls, 0, "No hugify needed - eager"); + expect_zu_eq(shard->psset.stats.full_slabs[1].npageslabs, 1, + "Page should be full-huge"); + + /* Deallocate 25% */ + for (int i = 0; i < NALLOCS / 4; i++) { + pai_dalloc( + tsdn, &shard->pai, edatas[i], &deferred_work_generated); + } + expect_true(deferred_work_generated, "purge is needed"); + ndefer_purge_calls = 0; + hpa_shard_do_deferred_work(tsdn, shard); + expect_zu_eq(ndefer_hugify_calls, 0, "No hugify needed - eager"); + expect_zu_eq(ndefer_purge_calls, 1, 
"Purge should have happened"); + + /* Allocate 20% again, so that we are above hugification threshold */ + ndefer_purge_calls = 0; + nstime_iadd(&defer_curtime, 800UL * 1000 * 1000); + for (int i = 0; i < NALLOCS / 4 - 1; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + hpa_shard_do_deferred_work(tsdn, shard); + expect_zu_eq(0, ndefer_purge_calls, "no purging needed"); + expect_zu_eq(ndefer_hugify_calls, 0, "no hugify - eager"); + destroy_test_data(shard); +} +TEST_END + +TEST_BEGIN(test_hpa_hugify_style_none_huge_no_syscall) { + test_skip_if(!hpa_supported() || (opt_process_madvise_max_batch != 0)); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + hooks.vectorized_purge = &defer_vectorized_purge; + + hpa_shard_opts_t opts = test_hpa_shard_opts_aggressive; + opts.deferral_allowed = true; + opts.purge_threshold = PAGE; + opts.min_purge_delay_ms = 0; + opts.hugification_threshold = HUGEPAGE * 0.25; + opts.dirty_mult = FXP_INIT_PERCENT(10); + opts.hugify_style = hpa_hugify_style_none; + opts.min_purge_interval_ms = 0; + opts.hugify_delay_ms = 0; + + hpa_shard_t *shard = create_test_data(&hooks, &opts); + bool deferred_work_generated = false; + /* Current time = 10ms */ + nstime_init(&defer_curtime, 10 * 1000 * 1000); + + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + /* First allocation makes the page huge */ + enum { NALLOCS = HUGEPAGE_PAGES }; + edata_t *edatas[NALLOCS]; + ndefer_purge_calls = 0; + for (int i = 0; i < NALLOCS / 2; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + hpdata_t *ps 
= psset_pick_alloc(&shard->psset, PAGE); + expect_false(hpdata_huge_get(ps), "Page should be non-huge"); + + ndefer_hugify_calls = 0; + ndefer_purge_calls = 0; + hpa_shard_do_deferred_work(tsdn, shard); + expect_zu_eq(ndefer_hugify_calls, 0, "Hugify none, no syscall"); + ps = psset_pick_alloc(&shard->psset, PAGE); + /* Check the huge flag itself; a bare non-NULL ps would pass for any page. */ + expect_true(hpdata_huge_get(ps), "Page should be huge"); + + destroy_test_data(shard); +} +TEST_END + int main(void) { /* @@ -801,5 +1457,10 @@ main(void) { test_alloc_dalloc_batch, test_defer_time, test_purge_no_infinite_loop, test_no_min_purge_interval, test_min_purge_interval, test_purge, - test_experimental_max_purge_nhp, test_vectorized_opt_eq_zero); + test_experimental_max_purge_nhp, test_vectorized_opt_eq_zero, + test_starts_huge, test_start_huge_purge_empty_only, + test_assume_huge_purge_fully, test_eager_with_purge_threshold, + test_delay_when_not_allowed_deferral, test_deferred_until_time, + test_eager_no_hugify_on_threshold, + test_hpa_hugify_style_none_huge_no_syscall); } diff --git a/test/unit/hpa_vectorized_madvise.c b/test/unit/hpa_vectorized_madvise.c index 8df54d06..c66811e1 100644 --- a/test/unit/hpa_vectorized_madvise.c +++ b/test/unit/hpa_vectorized_madvise.c @@ -37,7 +37,13 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = { /* min_purge_interval_ms */ 5 * 1000, /* experimental_max_purge_nhp */ - -1}; + -1, + /* purge_threshold */ + 1, + /* min_purge_delay_ms */ + 0, + /* hugify_style */ + hpa_hugify_style_lazy}; static hpa_shard_t * create_test_data(const hpa_hooks_t *hooks, hpa_shard_opts_t *opts) { diff --git a/test/unit/hpa_vectorized_madvise_large_batch.c b/test/unit/hpa_vectorized_madvise_large_batch.c index a5766620..8e7be7c0 100644 --- a/test/unit/hpa_vectorized_madvise_large_batch.c +++ b/test/unit/hpa_vectorized_madvise_large_batch.c @@ -37,7 +37,13 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = { /* min_purge_interval_ms */ 5 * 1000, /* experimental_max_purge_nhp */ - -1}; + -1, + /* purge_threshold */ + 1, + /*
min_purge_delay_ms */ + 0, + /* hugify_style */ + hpa_hugify_style_lazy}; static hpa_shard_t * create_test_data(const hpa_hooks_t *hooks, hpa_shard_opts_t *opts) { diff --git a/test/unit/hpdata.c b/test/unit/hpdata.c index 2329f065..ac45d697 100644 --- a/test/unit/hpdata.c +++ b/test/unit/hpdata.c @@ -5,7 +5,7 @@ TEST_BEGIN(test_reserve_alloc) { hpdata_t hpdata; - hpdata_init(&hpdata, HPDATA_ADDR, HPDATA_AGE); + hpdata_init(&hpdata, HPDATA_ADDR, HPDATA_AGE, /* is_huge */ false); /* Allocating a page at a time, we should do first fit. */ for (size_t i = 0; i < HUGEPAGE_PAGES; i++) { @@ -57,7 +57,7 @@ TEST_END TEST_BEGIN(test_purge_simple) { hpdata_t hpdata; - hpdata_init(&hpdata, HPDATA_ADDR, HPDATA_AGE); + hpdata_init(&hpdata, HPDATA_ADDR, HPDATA_AGE, /* is_huge */ false); void *alloc = hpdata_reserve_alloc(&hpdata, HUGEPAGE_PAGES / 2 * PAGE); expect_ptr_eq(alloc, HPDATA_ADDR, ""); @@ -101,7 +101,7 @@ TEST_END */ TEST_BEGIN(test_purge_intervening_dalloc) { hpdata_t hpdata; - hpdata_init(&hpdata, HPDATA_ADDR, HPDATA_AGE); + hpdata_init(&hpdata, HPDATA_ADDR, HPDATA_AGE, /* is_huge */ false); /* Allocate the first 3/4 of the pages. */ void *alloc = hpdata_reserve_alloc( @@ -164,7 +164,7 @@ TEST_BEGIN(test_purge_over_retained) { size_t purge_size; hpdata_t hpdata; - hpdata_init(&hpdata, HPDATA_ADDR, HPDATA_AGE); + hpdata_init(&hpdata, HPDATA_ADDR, HPDATA_AGE, /* is_huge */ false); /* Allocate the first 3/4 of the pages. 
*/ void *alloc = hpdata_reserve_alloc( @@ -238,7 +238,7 @@ TEST_END TEST_BEGIN(test_hugify) { hpdata_t hpdata; - hpdata_init(&hpdata, HPDATA_ADDR, HPDATA_AGE); + hpdata_init(&hpdata, HPDATA_ADDR, HPDATA_AGE, /* is_huge */ false); void *alloc = hpdata_reserve_alloc(&hpdata, HUGEPAGE / 2); expect_ptr_eq(alloc, HPDATA_ADDR, ""); diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index ac7506cf..d1974e0f 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -313,6 +313,9 @@ TEST_BEGIN(test_mallctl_opt) { TEST_MALLCTL_OPT(size_t, hpa_sec_bytes_after_flush, always); TEST_MALLCTL_OPT(size_t, hpa_sec_batch_fill_extra, always); TEST_MALLCTL_OPT(ssize_t, experimental_hpa_max_purge_nhp, always); + TEST_MALLCTL_OPT(size_t, hpa_purge_threshold, always); + TEST_MALLCTL_OPT(uint64_t, hpa_min_purge_delay_ms, always); + TEST_MALLCTL_OPT(const char *, hpa_hugify_style, always); TEST_MALLCTL_OPT(unsigned, narenas, always); TEST_MALLCTL_OPT(const char *, percpu_arena, always); TEST_MALLCTL_OPT(size_t, oversize_threshold, always); diff --git a/test/unit/psset.c b/test/unit/psset.c index 73a9835a..3ce8e976 100644 --- a/test/unit/psset.c +++ b/test/unit/psset.c @@ -124,7 +124,8 @@ TEST_BEGIN(test_empty) { test_skip_if(hpa_hugepage_size_exceeds_limit()); bool err; hpdata_t pageslab; - hpdata_init(&pageslab, PAGESLAB_ADDR, PAGESLAB_AGE); + bool is_huge = false; + hpdata_init(&pageslab, PAGESLAB_ADDR, PAGESLAB_AGE, is_huge); edata_t alloc; edata_init_test(&alloc); @@ -141,9 +142,10 @@ TEST_END TEST_BEGIN(test_fill) { test_skip_if(hpa_hugepage_size_exceeds_limit()); bool err; + bool is_huge = false; hpdata_t pageslab; - hpdata_init(&pageslab, PAGESLAB_ADDR, PAGESLAB_AGE); + hpdata_init(&pageslab, PAGESLAB_ADDR, PAGESLAB_AGE, is_huge); edata_t *alloc = (edata_t *)malloc(sizeof(edata_t) * HUGEPAGE_PAGES); @@ -179,7 +181,8 @@ TEST_BEGIN(test_reuse) { hpdata_t *ps; hpdata_t pageslab; - hpdata_init(&pageslab, PAGESLAB_ADDR, PAGESLAB_AGE); + bool is_huge = false; + 
hpdata_init(&pageslab, PAGESLAB_ADDR, PAGESLAB_AGE, is_huge); edata_t *alloc = (edata_t *)malloc(sizeof(edata_t) * HUGEPAGE_PAGES); @@ -274,7 +277,8 @@ TEST_BEGIN(test_evict) { hpdata_t *ps; hpdata_t pageslab; - hpdata_init(&pageslab, PAGESLAB_ADDR, PAGESLAB_AGE); + bool is_huge = false; + hpdata_init(&pageslab, PAGESLAB_ADDR, PAGESLAB_AGE, is_huge); edata_t *alloc = (edata_t *)malloc(sizeof(edata_t) * HUGEPAGE_PAGES); @@ -311,9 +315,10 @@ TEST_BEGIN(test_multi_pageslab) { hpdata_t *ps; hpdata_t pageslab[2]; - hpdata_init(&pageslab[0], PAGESLAB_ADDR, PAGESLAB_AGE); + bool is_huge = false; + hpdata_init(&pageslab[0], PAGESLAB_ADDR, PAGESLAB_AGE, is_huge); hpdata_init(&pageslab[1], (void *)((uintptr_t)PAGESLAB_ADDR + HUGEPAGE), - PAGESLAB_AGE + 1); + PAGESLAB_AGE + 1, is_huge); edata_t *alloc[2]; alloc[0] = (edata_t *)malloc(sizeof(edata_t) * HUGEPAGE_PAGES); @@ -376,7 +381,8 @@ TEST_END TEST_BEGIN(test_stats_merged) { hpdata_t pageslab; - hpdata_init(&pageslab, PAGESLAB_ADDR, PAGESLAB_AGE); + bool is_huge = false; + hpdata_init(&pageslab, PAGESLAB_ADDR, PAGESLAB_AGE, is_huge); edata_t *alloc = (edata_t *)malloc(sizeof(edata_t) * HUGEPAGE_PAGES); @@ -442,7 +448,8 @@ TEST_BEGIN(test_stats_huge) { test_skip_if(hpa_hugepage_size_exceeds_limit()); hpdata_t pageslab; - hpdata_init(&pageslab, PAGESLAB_ADDR, PAGESLAB_AGE); + bool is_huge = false; + hpdata_init(&pageslab, PAGESLAB_ADDR, PAGESLAB_AGE, is_huge); edata_t *alloc = (edata_t *)malloc(sizeof(edata_t) * HUGEPAGE_PAGES); @@ -570,7 +577,8 @@ TEST_BEGIN(test_stats_fullness) { bool err; hpdata_t pageslab; - hpdata_init(&pageslab, PAGESLAB_ADDR, PAGESLAB_AGE); + bool is_huge = false; + hpdata_init(&pageslab, PAGESLAB_ADDR, PAGESLAB_AGE, is_huge); edata_t *alloc = (edata_t *)malloc(sizeof(edata_t) * HUGEPAGE_PAGES); @@ -620,13 +628,15 @@ static void init_test_pageslabs(psset_t *psset, hpdata_t *pageslab, hpdata_t *worse_pageslab, edata_t *alloc, edata_t *worse_alloc) { bool err; + bool is_huge = false; - 
hpdata_init(pageslab, (void *)(10 * HUGEPAGE), PAGESLAB_AGE); + hpdata_init(pageslab, (void *)(10 * HUGEPAGE), PAGESLAB_AGE, is_huge); /* * This pageslab would be better from an address-first-fit POV, but * worse from an age POV. */ - hpdata_init(worse_pageslab, (void *)(9 * HUGEPAGE), PAGESLAB_AGE + 1); + hpdata_init( + worse_pageslab, (void *)(9 * HUGEPAGE), PAGESLAB_AGE + 1, is_huge); psset_init(psset); @@ -763,14 +773,15 @@ TEST_BEGIN(test_purge_prefers_nonhuge) { hpdata_t hpdata_nonhuge[NHP]; uintptr_t nonhuge_begin = (uintptr_t)&hpdata_nonhuge[0]; uintptr_t nonhuge_end = (uintptr_t)&hpdata_nonhuge[NHP]; + bool is_huge = false; for (size_t i = 0; i < NHP; i++) { - hpdata_init( - &hpdata_huge[i], (void *)((10 + i) * HUGEPAGE), 123 + i); + hpdata_init(&hpdata_huge[i], (void *)((10 + i) * HUGEPAGE), + 123 + i, is_huge); psset_insert(&psset, &hpdata_huge[i]); hpdata_init(&hpdata_nonhuge[i], - (void *)((10 + NHP + i) * HUGEPAGE), 456 + i); + (void *)((10 + NHP + i) * HUGEPAGE), 456 + i, is_huge); psset_insert(&psset, &hpdata_nonhuge[i]); } for (int i = 0; i < 2 * NHP; i++) { @@ -802,7 +813,7 @@ TEST_BEGIN(test_purge_prefers_nonhuge) { * further. 
*/ for (int i = 0; i < NHP; i++) { - hpdata = psset_pick_purge(&psset); + hpdata = psset_pick_purge(&psset, NULL); assert_true(nonhuge_begin <= (uintptr_t)hpdata && (uintptr_t)hpdata < nonhuge_end, ""); @@ -812,7 +823,7 @@ TEST_BEGIN(test_purge_prefers_nonhuge) { psset_update_end(&psset, hpdata); } for (int i = 0; i < NHP; i++) { - hpdata = psset_pick_purge(&psset); + hpdata = psset_pick_purge(&psset, NULL); expect_true(huge_begin <= (uintptr_t)hpdata && (uintptr_t)hpdata < huge_end, ""); @@ -825,6 +836,72 @@ TEST_BEGIN(test_purge_prefers_nonhuge) { } TEST_END +TEST_BEGIN(test_purge_timing) { /* Exercises the time argument of psset_pick_purge: three slabs get different purge-allowed times, and the expectations at the end check which slab is returned for a NULL time, for BASE_SEC + 90 and for BASE_SEC + 110. */ + test_skip_if(hpa_hugepage_size_exceeds_limit()); + void *ptr; + + psset_t psset; + psset_init(&psset); + + hpdata_t hpdata_empty_nh; + hpdata_t hpdata_empty_huge; + hpdata_t hpdata_nonempty; + + nstime_t basetime, now, empty_nh_tm, empty_huge_tm, nonempty_tm; + const uint64_t BASE_SEC = 100; + nstime_init2(&basetime, BASE_SEC, 0); /* NOTE(review): basetime is not read anywhere else in this test; confirm whether it is still needed. */ + + /* Create three slabs and add them to the psset; only hpdata_empty_huge is initialized with true as the last hpdata_init argument (presumably is_huge, as in the neighboring tests). */ + hpdata_init(&hpdata_empty_nh, (void *)(9 * HUGEPAGE), 102, false); + psset_insert(&psset, &hpdata_empty_nh); + hpdata_init(&hpdata_empty_huge, (void *)(10 * HUGEPAGE), 123, true); + psset_insert(&psset, &hpdata_empty_huge); + hpdata_init(&hpdata_nonempty, (void *)(11 * HUGEPAGE), 456, false); + psset_insert(&psset, &hpdata_nonempty); + + psset_update_begin(&psset, &hpdata_empty_nh); /* Reserve and release one page, then allow purging with time BASE_SEC + 100. */ + ptr = hpdata_reserve_alloc(&hpdata_empty_nh, PAGE); + expect_ptr_eq(hpdata_addr_get(&hpdata_empty_nh), ptr, ""); + hpdata_unreserve(&hpdata_empty_nh, ptr, PAGE); + hpdata_purge_allowed_set(&hpdata_empty_nh, true); + nstime_init2(&empty_nh_tm, BASE_SEC + 100, 0); + hpdata_time_purge_allowed_set(&hpdata_empty_nh, &empty_nh_tm); + psset_update_end(&psset, &hpdata_empty_nh); + + psset_update_begin(&psset, &hpdata_empty_huge); /* Same one-page reserve and release; the purge-allowed time is BASE_SEC + 110. */ + ptr = hpdata_reserve_alloc(&hpdata_empty_huge, PAGE); + expect_ptr_eq(hpdata_addr_get(&hpdata_empty_huge), ptr, ""); + hpdata_unreserve(&hpdata_empty_huge, ptr, PAGE); + nstime_init2(&empty_huge_tm, BASE_SEC +
110, 0); + hpdata_time_purge_allowed_set(&hpdata_empty_huge, &empty_huge_tm); + hpdata_purge_allowed_set(&hpdata_empty_huge, true); + psset_update_end(&psset, &hpdata_empty_huge); + + psset_update_begin(&psset, &hpdata_nonempty); /* Reserve 10 pages but release only 9, so one page of the reservation is not released; the purge-allowed time is BASE_SEC + 80, the earliest of the three. */ + ptr = hpdata_reserve_alloc(&hpdata_nonempty, 10 * PAGE); + expect_ptr_eq(hpdata_addr_get(&hpdata_nonempty), ptr, ""); + hpdata_unreserve(&hpdata_nonempty, ptr, 9 * PAGE); + hpdata_purge_allowed_set(&hpdata_nonempty, true); + nstime_init2(&nonempty_tm, BASE_SEC + 80, 0); + hpdata_time_purge_allowed_set(&hpdata_nonempty, &nonempty_tm); + psset_update_end(&psset, &hpdata_nonempty); + + /* The best to purge with no time restriction (NULL time argument) is the huge one */ + hpdata_t *ps = psset_pick_purge(&psset, NULL); + expect_ptr_eq(&hpdata_empty_huge, ps, "Without tick, pick huge"); + + /* However, only the one eligible for purging can be picked: at BASE_SEC + 90 only hpdata_nonempty (purge-allowed time BASE_SEC + 80) is expected to qualify */ + nstime_init2(&now, BASE_SEC + 90, 0); + ps = psset_pick_purge(&psset, &now); + expect_ptr_eq(&hpdata_nonempty, ps, "Only non empty purgable"); + + /* When all eligible, huge empty is the best; BASE_SEC + 110 equals the latest purge-allowed time set above, so this also expects a slab to qualify when the two times are equal */ + nstime_init2(&now, BASE_SEC + 110, 0); + ps = psset_pick_purge(&psset, &now); + expect_ptr_eq(&hpdata_empty_huge, ps, "Huge empty is the best"); +} +TEST_END + TEST_BEGIN(test_purge_prefers_empty) { test_skip_if(hpa_hugepage_size_exceeds_limit()); void *ptr; @@ -834,9 +911,10 @@ TEST_BEGIN(test_purge_prefers_empty) { hpdata_t hpdata_empty; hpdata_t hpdata_nonempty; - hpdata_init(&hpdata_empty, (void *)(10 * HUGEPAGE), 123); + bool is_huge = false; + hpdata_init(&hpdata_empty, (void *)(10 * HUGEPAGE), 123, is_huge); psset_insert(&psset, &hpdata_empty); - hpdata_init(&hpdata_nonempty, (void *)(11 * HUGEPAGE), 456); + hpdata_init(&hpdata_nonempty, (void *)(11 * HUGEPAGE), 456, is_huge); psset_insert(&psset, &hpdata_nonempty); psset_update_begin(&psset, &hpdata_empty); @@ -857,7 +935,7 @@ TEST_BEGIN(test_purge_prefers_empty) { * The nonempty slab has 9 dirty pages, while the empty one has only 1.
* We should still pick the empty one for purging. */ - hpdata_t *to_purge = psset_pick_purge(&psset); + hpdata_t *to_purge = psset_pick_purge(&psset, NULL); expect_ptr_eq(&hpdata_empty, to_purge, ""); } TEST_END @@ -876,13 +954,16 @@ TEST_BEGIN(test_purge_prefers_empty_huge) { uintptr_t cur_addr = 100 * HUGEPAGE; uint64_t cur_age = 123; + bool is_huge = false; for (int i = 0; i < NHP; i++) { - hpdata_init(&hpdata_huge[i], (void *)cur_addr, cur_age); + hpdata_init( + &hpdata_huge[i], (void *)cur_addr, cur_age, is_huge); cur_addr += HUGEPAGE; cur_age++; psset_insert(&psset, &hpdata_huge[i]); - hpdata_init(&hpdata_nonhuge[i], (void *)cur_addr, cur_age); + hpdata_init( + &hpdata_nonhuge[i], (void *)cur_addr, cur_age, is_huge); cur_addr += HUGEPAGE; cur_age++; psset_insert(&psset, &hpdata_nonhuge[i]); @@ -917,14 +998,14 @@ TEST_BEGIN(test_purge_prefers_empty_huge) { * any of the non-huge ones for purging. */ for (int i = 0; i < NHP; i++) { - hpdata_t *to_purge = psset_pick_purge(&psset); + hpdata_t *to_purge = psset_pick_purge(&psset, NULL); expect_ptr_eq(&hpdata_huge[i], to_purge, ""); psset_update_begin(&psset, to_purge); hpdata_purge_allowed_set(to_purge, false); psset_update_end(&psset, to_purge); } for (int i = 0; i < NHP; i++) { - hpdata_t *to_purge = psset_pick_purge(&psset); + hpdata_t *to_purge = psset_pick_purge(&psset, NULL); expect_ptr_eq(&hpdata_nonhuge[i], to_purge, ""); psset_update_begin(&psset, to_purge); hpdata_purge_allowed_set(to_purge, false); @@ -938,6 +1019,6 @@ main(void) { return test_no_reentrancy(test_empty, test_fill, test_reuse, test_evict, test_multi_pageslab, test_stats_merged, test_stats_huge, test_stats_fullness, test_oldest_fit, test_insert_remove, - test_purge_prefers_nonhuge, test_purge_prefers_empty, - test_purge_prefers_empty_huge); + test_purge_prefers_nonhuge, test_purge_timing, + test_purge_prefers_empty, test_purge_prefers_empty_huge); }