[HPA] Add ability to start page as huge and more flexibility for purging

This commit is contained in:
Slobodan Predolac 2025-08-25 13:23:07 -07:00 committed by Guangli Dai
parent ace437d26a
commit a199278f37
20 changed files with 1231 additions and 116 deletions

View file

@ -147,6 +147,15 @@ struct hpa_shard_s {
* Last time we performed purge on this shard.
*/
nstime_t last_purge;
/*
* Last time when we attempted work (purging or hugifying). If deferral
* of the work is allowed (we have background thread), this is the time
* when background thread checked if purging or hugifying needs to be
* done. If deferral is not allowed, this is the time of (hpa_alloc or
* hpa_dalloc) activity in the shard.
*/
nstime_t last_time_work_attempted;
};
bool hpa_hugepage_size_exceeds_limit(void);

View file

@ -7,8 +7,60 @@
/*
* This file is morally part of hpa.h, but is split out for header-ordering
* reasons.
*
* All of these hpa_shard_opts below are experimental. We are exploring more
* efficient packing, hugifying, and purging approaches to make efficient
* trade-offs between CPU, memory, latency, and usability. This means all of
* them are at the risk of being deprecated and corresponding configurations
* should be updated once the final version settles.
*/
/*
* This enum controls how jemalloc hugifies/dehugifies pages. Each style may be
* more suitable depending on deployment environments.
*
* hpa_hugify_style_none
* Using this means that jemalloc will not be hugifying or dehugifying pages,
* but will let the kernel make those decisions. This style only makes sense
* when deploying on systems where THP are enabled in 'always' mode. With this
* style, you most likely want to have no purging at all (dirty_mult=-1) or
purge_threshold=HUGEPAGE bytes (2097152 for a 2 MiB hugepage), although other
* thresholds may work well depending on kernel settings of your deployment
* targets.
*
* hpa_hugify_style_eager
* This style results in jemalloc giving hugepage advice, if needed, to
* anonymous memory immediately after it is mapped, so huge pages can be backing
* that memory at page-fault time. This is usually more efficient than doing
* it later, and it allows us to benefit from the hugepages from the start.
* Same options for purging as for the style 'none' are good starting choices:
* no purging, or purge_threshold=HUGEPAGE, some min_purge_delay_ms that allows
for a page not to be purged too quickly, etc. This is a good choice if you
can afford extra memory and your application gains performance from
transparent hugepages.
*
* hpa_hugify_style_lazy
* This style is suitable when you purge more aggressively (you sacrifice CPU
* performance for less memory). When this style is chosen, jemalloc will
* hugify once hugification_threshold is reached, and dehugify before purging.
* If the kernel is configured to use direct compaction you may experience some
allocation latency when using this style. It is best to measure what works
better for your application needs, in the target deployment environment.
* This is a good choice for apps that cannot afford a lot of memory regression,
* but would still like to benefit from backing certain memory regions with
* hugepages.
*/
/*
 * Hugification/dehugification policies; each style is described in detail in
 * the comment above.
 */
enum hpa_hugify_style_e {
/*
 * NOTE(review): 'auto' is not described in the comment above; presumably it
 * is resolved to one of the concrete styles elsewhere -- confirm against the
 * implementation.
 */
hpa_hugify_style_auto = 0,
/* Never hugify/dehugify; leave all hugepage decisions to the kernel. */
hpa_hugify_style_none = 1,
/* Give hugepage advice immediately after the memory is mapped. */
hpa_hugify_style_eager = 2,
/* Hugify once hugification_threshold is reached; dehugify before purging. */
hpa_hugify_style_lazy = 3,
/* Number of valid styles; not itself a usable style. */
hpa_hugify_style_limit = hpa_hugify_style_lazy + 1
};
typedef enum hpa_hugify_style_e hpa_hugify_style_t;
extern const char *const hpa_hugify_style_names[];
typedef struct hpa_shard_opts_s hpa_shard_opts_t;
struct hpa_shard_opts_s {
/*
@ -46,7 +98,8 @@ struct hpa_shard_opts_s {
uint64_t hugify_delay_ms;
/*
* Hugify pages synchronously.
* Hugify pages synchronously (hugify will happen even if hugify_style
* is not hpa_hugify_style_lazy).
*/
bool hugify_sync;
@ -59,6 +112,46 @@ struct hpa_shard_opts_s {
* Maximum number of hugepages to purge on each purging attempt.
*/
ssize_t experimental_max_purge_nhp;
/*
* Minimum number of inactive bytes needed for a non-empty page to be
* considered purgable.
*
* When the number of touched inactive bytes on non-empty hugepage is
* >= purge_threshold, the page is purgable. Empty pages are always
* purgable. Setting this to HUGEPAGE bytes would only purge empty
* pages if using hugify_style_eager and the purges would be exactly
* HUGEPAGE bytes. Depending on your kernel settings, this may result
* in better performance.
*
* Please note, when threshold is reached, we will purge all the dirty
* bytes, and not just up to the threshold. If this is PAGE bytes, then
* all the pages that have any dirty bytes are purgable. We treat
the purgability constraint from purge_threshold as stronger than
dirty_mult; in other words, if no page meets purge_threshold, we will not
purge
* even if we are above dirty_mult.
*/
size_t purge_threshold;
/*
Minimum number of ms that needs to elapse between an HP page becoming
* eligible for purging and actually getting purged.
*
Setting this to a larger number gives a better chance of reusing
* that memory. Setting it to 0 means that page is eligible for purging
* as soon as it meets the purge_threshold. The clock resets when
* purgability of the page changes (page goes from being non-purgable to
* purgable). When using eager style you probably want to allow for
* some delay, to avoid purging the page too quickly and give it time to
* be used.
*/
uint64_t min_purge_delay_ms;
/*
* Style of hugification/dehugification (see comment at
* hpa_hugify_style_t for options).
*/
hpa_hugify_style_t hugify_style;
};
/* clang-format off */
@ -84,7 +177,13 @@ struct hpa_shard_opts_s {
/* min_purge_interval_ms */ \
5 * 1000, \
/* experimental_max_purge_nhp */ \
-1 \
-1, \
/* size_t purge_threshold */ \
PAGE, \
/* min_purge_delay_ms */ \
0, \
/* hugify_style */ \
hpa_hugify_style_lazy \
}
/* clang-format on */

View file

@ -124,6 +124,12 @@ struct hpdata_s {
/* The touched pages (using the same definition as above). */
fb_group_t touched_pages[FB_NGROUPS(HUGEPAGE_PAGES)];
/* Time when this extent (hpdata) becomes eligible for purging */
nstime_t h_time_purge_allowed;
/* True if the extent was huge and empty last time when it was purged */
bool h_purged_when_empty_and_huge;
};
TYPED_LIST(hpdata_empty_list, hpdata_t, ql_link_empty)
@ -284,17 +290,17 @@ hpdata_longest_free_range_set(hpdata_t *hpdata, size_t longest_free_range) {
}
static inline size_t
hpdata_nactive_get(hpdata_t *hpdata) {
hpdata_nactive_get(const hpdata_t *hpdata) {
return hpdata->h_nactive;
}
static inline size_t
hpdata_ntouched_get(hpdata_t *hpdata) {
hpdata_ntouched_get(const hpdata_t *hpdata) {
return hpdata->h_ntouched;
}
static inline size_t
hpdata_ndirty_get(hpdata_t *hpdata) {
hpdata_ndirty_get(const hpdata_t *hpdata) {
return hpdata->h_ntouched - hpdata->h_nactive;
}
@ -303,6 +309,26 @@ hpdata_nretained_get(hpdata_t *hpdata) {
return HUGEPAGE_PAGES - hpdata->h_ntouched;
}
/*
 * Record the time at which this hpdata becomes eligible for purging
 * (stored in h_time_purge_allowed).
 */
static inline void
hpdata_time_purge_allowed_set(hpdata_t *hpdata, const nstime_t *v) {
	nstime_t *dst = &hpdata->h_time_purge_allowed;
	nstime_copy(dst, v);
}
/* Earliest time at which purging this hpdata is permitted. */
static inline const nstime_t *
hpdata_time_purge_allowed_get(const hpdata_t *hpdata) {
	const nstime_t *allowed = &hpdata->h_time_purge_allowed;
	return allowed;
}
/*
 * Whether this extent was huge and empty the last time it was purged
 * (see h_purged_when_empty_and_huge).
 */
static inline bool
hpdata_purged_when_empty_and_huge_get(const hpdata_t *hpdata) {
	bool was_empty_and_huge = hpdata->h_purged_when_empty_and_huge;
	return was_empty_and_huge;
}
/* Update the "was huge and empty when last purged" flag. */
static inline void
hpdata_purged_when_empty_and_huge_set(hpdata_t *hpdata, bool v) {
	bool flag = v;
	hpdata->h_purged_when_empty_and_huge = flag;
}
static inline void
hpdata_assert_empty(hpdata_t *hpdata) {
assert(fb_empty(hpdata->active_pages, HUGEPAGE_PAGES));
@ -360,7 +386,7 @@ hpdata_full(const hpdata_t *hpdata) {
return hpdata->h_nactive == HUGEPAGE_PAGES;
}
void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age);
void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age, bool is_huge);
/*
* Given an hpdata which can serve an allocation request, pick and reserve an

View file

@ -40,6 +40,8 @@ void nstime_isubtract(nstime_t *time, uint64_t subtrahend);
void nstime_imultiply(nstime_t *time, uint64_t multiplier);
void nstime_idivide(nstime_t *time, uint64_t divisor);
uint64_t nstime_divide(const nstime_t *time, const nstime_t *divisor);
uint64_t nstime_ns_between(const nstime_t *earlier, const nstime_t *later);
uint64_t nstime_ms_between(const nstime_t *earlier, const nstime_t *later);
uint64_t nstime_ns_since(const nstime_t *past);
uint64_t nstime_ms_since(const nstime_t *past);
@ -67,7 +69,7 @@ nstime_init_zero(nstime_t *time) {
}
JEMALLOC_ALWAYS_INLINE bool
nstime_equals_zero(nstime_t *time) {
nstime_equals_zero(const nstime_t *time) {
int diff = nstime_compare(time, &nstime_zero);
assert(diff >= 0);
return diff == 0;

View file

@ -121,8 +121,12 @@ void psset_update_end(psset_t *psset, hpdata_t *ps);
/* Analogous to the eset_fit; pick a hpdata to serve the request. */
hpdata_t *psset_pick_alloc(psset_t *psset, size_t size);
/* Pick one to purge. */
hpdata_t *psset_pick_purge(psset_t *psset);
/*
* Pick one to purge that is purgable before the given time (inclusive). If
* now is NULL then time is not considered.
*/
hpdata_t *psset_pick_purge(psset_t *psset, const nstime_t *now);
/* Pick one to hugify. */
hpdata_t *psset_pick_hugify(psset_t *psset);