[HPA] Add ability to start page as huge and more flexibility for purging

This commit is contained in:
Slobodan Predolac 2025-08-25 13:23:07 -07:00 committed by Guangli Dai
parent ace437d26a
commit a199278f37
20 changed files with 1231 additions and 116 deletions

View file

@ -147,6 +147,15 @@ struct hpa_shard_s {
* Last time we performed purge on this shard.
*/
nstime_t last_purge;
/*
* Last time when we attempted work (purging or hugifying). If deferral
* of the work is allowed (we have background thread), this is the time
* when background thread checked if purging or hugifying needs to be
* done. If deferral is not allowed, this is the time of (hpa_alloc or
* hpa_dalloc) activity in the shard.
*/
nstime_t last_time_work_attempted;
};
bool hpa_hugepage_size_exceeds_limit(void);

View file

@ -7,8 +7,60 @@
/*
* This file is morally part of hpa.h, but is split out for header-ordering
* reasons.
*
* All of these hpa_shard_opts below are experimental. We are exploring more
* efficient packing, hugifying, and purging approaches to make efficient
* trade-offs between CPU, memory, latency, and usability. This means all of
* them are at the risk of being deprecated and corresponding configurations
* should be updated once the final version settles.
*/
/*
* This enum controls how jemalloc hugifies/dehugifies pages. Each style may be
* more suitable depending on deployment environments.
*
* hpa_hugify_style_none
* Using this means that jemalloc will not be hugifying or dehugifying pages,
* but will let the kernel make those decisions. This style only makes sense
* when deploying on systems where THP are enabled in 'always' mode. With this
* style, you most likely want to have no purging at all (dirty_mult=-1) or
purge_threshold=HUGEPAGE bytes (2097152 for a 2 MiB hugepage), although other
* thresholds may work well depending on kernel settings of your deployment
* targets.
*
* hpa_hugify_style_eager
* This style results in jemalloc giving hugepage advice, if needed, to
* anonymous memory immediately after it is mapped, so huge pages can be backing
* that memory at page-fault time. This is usually more efficient than doing
* it later, and it allows us to benefit from the hugepages from the start.
* Same options for purging as for the style 'none' are good starting choices:
* no purging, or purge_threshold=HUGEPAGE, some min_purge_delay_ms that allows
for a page not to be purged too quickly, etc. This is a good choice if you
can afford extra memory and your application gains performance from
transparent hugepages.
*
* hpa_hugify_style_lazy
* This style is suitable when you purge more aggressively (you sacrifice CPU
* performance for less memory). When this style is chosen, jemalloc will
* hugify once hugification_threshold is reached, and dehugify before purging.
* If the kernel is configured to use direct compaction you may experience some
allocation latency when using this style. It is best to measure what works
better for your application needs, in the target deployment environment.
* This is a good choice for apps that cannot afford a lot of memory regression,
* but would still like to benefit from backing certain memory regions with
* hugepages.
*/
/*
 * Hugification/dehugification policies; each style is described in detail in
 * the comment above.
 */
enum hpa_hugify_style_e {
/*
 * NOTE(review): 'auto' is not described in the comment above; presumably it
 * is resolved to one of the concrete styles elsewhere -- confirm against the
 * implementation.
 */
hpa_hugify_style_auto = 0,
/* Never hugify/dehugify; leave all hugepage decisions to the kernel. */
hpa_hugify_style_none = 1,
/* Give hugepage advice immediately after the memory is mapped. */
hpa_hugify_style_eager = 2,
/* Hugify once hugification_threshold is reached; dehugify before purging. */
hpa_hugify_style_lazy = 3,
/* Number of valid styles; not itself a usable style. */
hpa_hugify_style_limit = hpa_hugify_style_lazy + 1
};
typedef enum hpa_hugify_style_e hpa_hugify_style_t;
extern const char *const hpa_hugify_style_names[];
typedef struct hpa_shard_opts_s hpa_shard_opts_t;
struct hpa_shard_opts_s {
/*
@ -46,7 +98,8 @@ struct hpa_shard_opts_s {
uint64_t hugify_delay_ms;
/*
* Hugify pages synchronously.
* Hugify pages synchronously (hugify will happen even if hugify_style
* is not hpa_hugify_style_lazy).
*/
bool hugify_sync;
@ -59,6 +112,46 @@ struct hpa_shard_opts_s {
* Maximum number of hugepages to purge on each purging attempt.
*/
ssize_t experimental_max_purge_nhp;
/*
* Minimum number of inactive bytes needed for a non-empty page to be
* considered purgable.
*
* When the number of touched inactive bytes on non-empty hugepage is
* >= purge_threshold, the page is purgable. Empty pages are always
* purgable. Setting this to HUGEPAGE bytes would only purge empty
* pages if using hugify_style_eager and the purges would be exactly
* HUGEPAGE bytes. Depending on your kernel settings, this may result
* in better performance.
*
* Please note, when threshold is reached, we will purge all the dirty
* bytes, and not just up to the threshold. If this is PAGE bytes, then
* all the pages that have any dirty bytes are purgable. We treat
the purgability constraint from purge_threshold as stronger than
dirty_mult; in other words, if no page meets purge_threshold, we will not
purge
* even if we are above dirty_mult.
*/
size_t purge_threshold;
/*
Minimum number of ms that needs to elapse between an HP page becoming
* eligible for purging and actually getting purged.
*
Setting this to a larger number gives a better chance of reusing
* that memory. Setting it to 0 means that page is eligible for purging
* as soon as it meets the purge_threshold. The clock resets when
* purgability of the page changes (page goes from being non-purgable to
* purgable). When using eager style you probably want to allow for
* some delay, to avoid purging the page too quickly and give it time to
* be used.
*/
uint64_t min_purge_delay_ms;
/*
* Style of hugification/dehugification (see comment at
* hpa_hugify_style_t for options).
*/
hpa_hugify_style_t hugify_style;
};
/* clang-format off */
@ -84,7 +177,13 @@ struct hpa_shard_opts_s {
/* min_purge_interval_ms */ \
5 * 1000, \
/* experimental_max_purge_nhp */ \
-1 \
-1, \
/* size_t purge_threshold */ \
PAGE, \
/* min_purge_delay_ms */ \
0, \
/* hugify_style */ \
hpa_hugify_style_lazy \
}
/* clang-format on */

View file

@ -124,6 +124,12 @@ struct hpdata_s {
/* The touched pages (using the same definition as above). */
fb_group_t touched_pages[FB_NGROUPS(HUGEPAGE_PAGES)];
/* Time when this extent (hpdata) becomes eligible for purging */
nstime_t h_time_purge_allowed;
/* True if the extent was huge and empty last time when it was purged */
bool h_purged_when_empty_and_huge;
};
TYPED_LIST(hpdata_empty_list, hpdata_t, ql_link_empty)
@ -284,17 +290,17 @@ hpdata_longest_free_range_set(hpdata_t *hpdata, size_t longest_free_range) {
}
static inline size_t
hpdata_nactive_get(hpdata_t *hpdata) {
hpdata_nactive_get(const hpdata_t *hpdata) {
return hpdata->h_nactive;
}
static inline size_t
hpdata_ntouched_get(hpdata_t *hpdata) {
hpdata_ntouched_get(const hpdata_t *hpdata) {
return hpdata->h_ntouched;
}
static inline size_t
hpdata_ndirty_get(hpdata_t *hpdata) {
hpdata_ndirty_get(const hpdata_t *hpdata) {
return hpdata->h_ntouched - hpdata->h_nactive;
}
@ -303,6 +309,26 @@ hpdata_nretained_get(hpdata_t *hpdata) {
return HUGEPAGE_PAGES - hpdata->h_ntouched;
}
/*
 * Record the time at which this hpdata becomes eligible for purging
 * (stored in h_time_purge_allowed).
 */
static inline void
hpdata_time_purge_allowed_set(hpdata_t *hpdata, const nstime_t *v) {
	nstime_t *dst = &hpdata->h_time_purge_allowed;
	nstime_copy(dst, v);
}
/* Earliest time at which purging this hpdata is permitted. */
static inline const nstime_t *
hpdata_time_purge_allowed_get(const hpdata_t *hpdata) {
	const nstime_t *allowed = &hpdata->h_time_purge_allowed;
	return allowed;
}
/*
 * Whether this extent was huge and empty the last time it was purged
 * (see h_purged_when_empty_and_huge).
 */
static inline bool
hpdata_purged_when_empty_and_huge_get(const hpdata_t *hpdata) {
	bool was_empty_and_huge = hpdata->h_purged_when_empty_and_huge;
	return was_empty_and_huge;
}
/* Update the "was huge and empty when last purged" flag. */
static inline void
hpdata_purged_when_empty_and_huge_set(hpdata_t *hpdata, bool v) {
	bool flag = v;
	hpdata->h_purged_when_empty_and_huge = flag;
}
static inline void
hpdata_assert_empty(hpdata_t *hpdata) {
assert(fb_empty(hpdata->active_pages, HUGEPAGE_PAGES));
@ -360,7 +386,7 @@ hpdata_full(const hpdata_t *hpdata) {
return hpdata->h_nactive == HUGEPAGE_PAGES;
}
void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age);
void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age, bool is_huge);
/*
* Given an hpdata which can serve an allocation request, pick and reserve an

View file

@ -40,6 +40,8 @@ void nstime_isubtract(nstime_t *time, uint64_t subtrahend);
void nstime_imultiply(nstime_t *time, uint64_t multiplier);
void nstime_idivide(nstime_t *time, uint64_t divisor);
uint64_t nstime_divide(const nstime_t *time, const nstime_t *divisor);
uint64_t nstime_ns_between(const nstime_t *earlier, const nstime_t *later);
uint64_t nstime_ms_between(const nstime_t *earlier, const nstime_t *later);
uint64_t nstime_ns_since(const nstime_t *past);
uint64_t nstime_ms_since(const nstime_t *past);
@ -67,7 +69,7 @@ nstime_init_zero(nstime_t *time) {
}
JEMALLOC_ALWAYS_INLINE bool
nstime_equals_zero(nstime_t *time) {
nstime_equals_zero(const nstime_t *time) {
int diff = nstime_compare(time, &nstime_zero);
assert(diff >= 0);
return diff == 0;

View file

@ -121,8 +121,12 @@ void psset_update_end(psset_t *psset, hpdata_t *ps);
/* Analogous to the eset_fit; pick a hpdata to serve the request. */
hpdata_t *psset_pick_alloc(psset_t *psset, size_t size);
/* Pick one to purge. */
hpdata_t *psset_pick_purge(psset_t *psset);
/*
* Pick one to purge that is purgable before the given time (inclusive). If
* now is NULL then time is not considered.
*/
hpdata_t *psset_pick_purge(psset_t *psset, const nstime_t *now);
/* Pick one to hugify. */
hpdata_t *psset_pick_hugify(psset_t *psset);