Add opt hpa_hugify_sync to hugify synchronously

Linux 6.1 introduced `MADV_COLLAPSE` flag to perform a best-effort
synchronous collapse of the native pages mapped by the memory range into
transparent huge pages.

Synchronous hugification might be beneficial for at least two reasons:
we no longer rely on khugepaged, and we get instant feedback if the
range wasn't hugified.

If the `hpa_hugify_sync` option is on, we'll try to perform a synchronous
collapse, and if it isn't successful, we'll fall back to asynchronous
behaviour.
This commit is contained in:
Dmitry Ilvokhin 2024-10-31 11:43:11 -07:00 committed by stanjo74
parent a361e886e2
commit 0ce13c6fb5
15 changed files with 141 additions and 8 deletions

View file

@ -2491,6 +2491,16 @@ if test "x${je_cv_madvise}" = "xyes" ; then
if test "x${je_cv_madv_nocore}" = "xyes" ; then
AC_DEFINE([JEMALLOC_MADVISE_NOCORE], [ ], [ ])
fi
dnl Check for madvise(..., MADV_COLLAPSE).
JE_COMPILABLE([madvise(..., MADV_COLLAPSE)], [
#include <sys/mman.h>
], [
madvise((void *)0, 0, MADV_COLLAPSE);
], [je_cv_madv_collapse])
if test "x${je_cv_madv_collapse}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_MADVISE_COLLAPSE], [ ], [ ])
fi
else
dnl Check for posix_madvise.
JE_COMPILABLE([posix_madvise], [

View file

@ -61,6 +61,14 @@ struct hpa_shard_nonderived_stats_s {
* Guarded by mtx.
*/
uint64_t nhugifies;
/*
* The number of times we've tried to hugify a pageslab, but failed.
*
* Guarded by mtx.
*/
uint64_t nhugify_failures;
/*
* The number of times we've dehugified a pageslab.
*

View file

@ -9,7 +9,7 @@ struct hpa_hooks_s {
void *(*map)(size_t size);
void (*unmap)(void *ptr, size_t size);
void (*purge)(void *ptr, size_t size);
void (*hugify)(void *ptr, size_t size);
bool (*hugify)(void *ptr, size_t size, bool sync);
void (*dehugify)(void *ptr, size_t size);
void (*curtime)(nstime_t *r_time, bool first_reading);
uint64_t (*ms_since)(nstime_t *r_time);

View file

@ -45,6 +45,11 @@ struct hpa_shard_opts_s {
*/
uint64_t hugify_delay_ms;
/*
* Hugify pages synchronously.
*/
bool hugify_sync;
/*
* Minimum amount of time between purges.
*/
@ -73,6 +78,8 @@ struct hpa_shard_opts_s {
false, \
/* hugify_delay_ms */ \
10 * 1000, \
/* hugify_sync */ \
false, \
/* min_purge_interval_ms */ \
5 * 1000, \
/* experimental_max_purge_nhp */ \

View file

@ -308,6 +308,13 @@
*/
#undef JEMALLOC_HAVE_MADVISE_HUGE
/*
* Defined if best-effort synchronous collapse of the native
* pages mapped by the memory range into transparent huge pages is supported
* via MADV_COLLAPSE arguments to madvise(2).
*/
#undef JEMALLOC_HAVE_MADVISE_COLLAPSE
/*
* Methods for purging unused pages differ between operating systems.
*

View file

@ -57,6 +57,15 @@
# define JEMALLOC_MADV_FREE 8
#endif
/*
 * Can be defined at compile time in cases where it is known that the
 * madvise(..., MADV_COLLAPSE) feature is supported, but the
 * MADV_COLLAPSE constant is not defined.
 */
#ifdef JEMALLOC_DEFINE_MADVISE_COLLAPSE
# define JEMALLOC_MADV_COLLAPSE 25
#endif
static const bool config_debug =
#ifdef JEMALLOC_DEBUG
true

View file

@ -123,6 +123,7 @@ bool pages_purge_lazy(void *addr, size_t size);
bool pages_purge_forced(void *addr, size_t size);
bool pages_huge(void *addr, size_t size);
bool pages_nohuge(void *addr, size_t size);
bool pages_collapse(void *addr, size_t size);
bool pages_dontdump(void *addr, size_t size);
bool pages_dodump(void *addr, size_t size);
bool pages_boot(void);

View file

@ -103,6 +103,7 @@ CTL_PROTO(opt_hpa)
CTL_PROTO(opt_hpa_slab_max_alloc)
CTL_PROTO(opt_hpa_hugification_threshold)
CTL_PROTO(opt_hpa_hugify_delay_ms)
CTL_PROTO(opt_hpa_hugify_sync)
CTL_PROTO(opt_hpa_min_purge_interval_ms)
CTL_PROTO(opt_experimental_hpa_max_purge_nhp)
CTL_PROTO(opt_hpa_dirty_mult)
@ -263,6 +264,7 @@ INDEX_PROTO(stats_arenas_i_extents_j)
CTL_PROTO(stats_arenas_i_hpa_shard_npurge_passes)
CTL_PROTO(stats_arenas_i_hpa_shard_npurges)
CTL_PROTO(stats_arenas_i_hpa_shard_nhugifies)
CTL_PROTO(stats_arenas_i_hpa_shard_nhugify_failures)
CTL_PROTO(stats_arenas_i_hpa_shard_ndehugifies)
/* We have a set of stats for full slabs. */
@ -462,6 +464,7 @@ static const ctl_named_node_t opt_node[] = {
{NAME("hpa_hugification_threshold"),
CTL(opt_hpa_hugification_threshold)},
{NAME("hpa_hugify_delay_ms"), CTL(opt_hpa_hugify_delay_ms)},
{NAME("hpa_hugify_sync"), CTL(opt_hpa_hugify_sync)},
{NAME("hpa_min_purge_interval_ms"), CTL(opt_hpa_min_purge_interval_ms)},
{NAME("experimental_hpa_max_purge_nhp"),
CTL(opt_experimental_hpa_max_purge_nhp)},
@ -834,6 +837,8 @@ static const ctl_named_node_t stats_arenas_i_hpa_shard_node[] = {
{NAME("npurge_passes"), CTL(stats_arenas_i_hpa_shard_npurge_passes)},
{NAME("npurges"), CTL(stats_arenas_i_hpa_shard_npurges)},
{NAME("nhugifies"), CTL(stats_arenas_i_hpa_shard_nhugifies)},
{NAME("nhugify_failures"),
CTL(stats_arenas_i_hpa_shard_nhugify_failures)},
{NAME("ndehugifies"), CTL(stats_arenas_i_hpa_shard_ndehugifies)}
};
@ -2200,6 +2205,7 @@ CTL_RO_NL_GEN(opt_hpa, opt_hpa, bool)
CTL_RO_NL_GEN(opt_hpa_hugification_threshold,
opt_hpa_opts.hugification_threshold, size_t)
CTL_RO_NL_GEN(opt_hpa_hugify_delay_ms, opt_hpa_opts.hugify_delay_ms, uint64_t)
CTL_RO_NL_GEN(opt_hpa_hugify_sync, opt_hpa_opts.hugify_sync, bool)
CTL_RO_NL_GEN(opt_hpa_min_purge_interval_ms, opt_hpa_opts.min_purge_interval_ms,
uint64_t)
CTL_RO_NL_GEN(opt_experimental_hpa_max_purge_nhp,
@ -4061,6 +4067,9 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_npurges,
arenas_i(mib[2])->astats->hpastats.nonderived_stats.npurges, uint64_t);
CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nhugifies,
arenas_i(mib[2])->astats->hpastats.nonderived_stats.nhugifies, uint64_t);
CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nhugify_failures,
arenas_i(mib[2])->astats->hpastats.nonderived_stats.nhugify_failures,
uint64_t);
CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_ndehugifies,
arenas_i(mib[2])->astats->hpastats.nonderived_stats.ndehugifies, uint64_t);

View file

@ -210,6 +210,7 @@ hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
shard->stats.npurge_passes = 0;
shard->stats.npurges = 0;
shard->stats.nhugifies = 0;
shard->stats.nhugify_failures = 0;
shard->stats.ndehugifies = 0;
/*
@ -242,6 +243,7 @@ hpa_shard_nonderived_stats_accum(hpa_shard_nonderived_stats_t *dst,
dst->npurge_passes += src->npurge_passes;
dst->npurges += src->npurges;
dst->nhugifies += src->nhugifies;
dst->nhugify_failures += src->nhugify_failures;
dst->ndehugifies += src->ndehugifies;
}
@ -499,10 +501,23 @@ hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) {
malloc_mutex_unlock(tsdn, &shard->mtx);
shard->central->hooks.hugify(hpdata_addr_get(to_hugify), HUGEPAGE);
bool err = shard->central->hooks.hugify(hpdata_addr_get(to_hugify),
HUGEPAGE, shard->opts.hugify_sync);
malloc_mutex_lock(tsdn, &shard->mtx);
shard->stats.nhugifies++;
if (err) {
/*
 * When asynchronous hugification is used
 * (shard->opts.hugify_sync option is false), we do not expect
 * to get here unless something went terribly wrong.  Because
 * the underlying syscall only sets a kernel flag for the
 * memory range (actual hugification happens asynchronously
 * and we get no feedback about its outcome), we expect the
 * syscall to succeed all the time.
 */
shard->stats.nhugify_failures++;
}
psset_update_begin(&shard->psset, to_hugify);
hpdata_hugify(to_hugify);

View file

@ -6,7 +6,7 @@
static void *hpa_hooks_map(size_t size);
static void hpa_hooks_unmap(void *ptr, size_t size);
static void hpa_hooks_purge(void *ptr, size_t size);
static void hpa_hooks_hugify(void *ptr, size_t size);
static bool hpa_hooks_hugify(void *ptr, size_t size, bool sync);
static void hpa_hooks_dehugify(void *ptr, size_t size);
static void hpa_hooks_curtime(nstime_t *r_nstime, bool first_reading);
static uint64_t hpa_hooks_ms_since(nstime_t *past_nstime);
@ -37,10 +37,27 @@ hpa_hooks_purge(void *ptr, size_t size) {
pages_purge_forced(ptr, size);
}
static void
hpa_hooks_hugify(void *ptr, size_t size) {
static bool
hpa_hooks_hugify(void *ptr, size_t size, bool sync) {
/*
* We mark memory range as huge independently on which hugification
* technique is used (synchronous or asynchronous) to have correct
* VmFlags set for introspection and accounting purposes. If
* synchronous hugification is enabled and pages_collapse call fails,
* then we hope memory range will be hugified asynchronously by
* khugepaged eventually. Right now, 3 out of 4 error return codes of
* madvise(..., MADV_COLLAPSE) are retryable. Instead of retrying, we
* just fallback to asynchronous khugepaged hugification to simplify
* implementation, even if we might know khugepaged fallback will not
* be successful (current madvise(..., MADV_COLLAPSE) implementation
* hints, when EINVAL is returned it is likely that khugepaged won't be
* able to collapse memory range into hugepage either).
*/
bool err = pages_huge(ptr, size);
(void)err;
if (sync) {
err = pages_collapse(ptr, size);
}
return err;
}
static void

View file

@ -1093,6 +1093,15 @@ validate_hpa_settings(void) {
if (opt_hpa_opts.dirty_mult != (fxp_t)-1 && validate_hpa_ratios()) {
had_conf_error = true;
}
#ifndef JEMALLOC_HAVE_MADVISE_COLLAPSE
if (opt_hpa_opts.hugify_sync) {
had_conf_error = true;
malloc_printf(
"<jemalloc>: hpa_hugify_sync config option is enabled, "
"but MADV_COLLAPSE support was not detected at build "
"time.");
}
#endif
}
static void
@ -1566,6 +1575,9 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
0, 0, CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX,
false);
CONF_HANDLE_BOOL(
opt_hpa_opts.hugify_sync, "hpa_hugify_sync");
CONF_HANDLE_UINT64_T(
opt_hpa_opts.min_purge_interval_ms,
"hpa_min_purge_interval_ms", 0, 0,

View file

@ -567,6 +567,30 @@ pages_nohuge_unaligned(void *addr, size_t size) {
return pages_nohuge_impl(addr, size, false);
}
bool
pages_collapse(void *addr, size_t size) {
assert(PAGE_ADDR2BASE(addr) == addr);
assert(PAGE_CEILING(size) == size);
/*
 * Best-effort synchronous collapse of the native pages backing
 * [addr, addr + size) into transparent huge pages via
 * madvise(..., MADV_COLLAPSE).  Returns true on failure (including when
 * MADV_COLLAPSE support was not detected at build time), false on
 * success.
 *
 * There is one more MADV_COLLAPSE precondition that is not easy to
 * express with an assert statement.  In order for the madvise(addr,
 * size, MADV_COLLAPSE) call to be successful, at least one page in the
 * range must currently be backed by physical memory.  In particular,
 * this means we can't call pages_collapse on a freshly mapped memory
 * region.  See the madvise(2) man page for more details.
 */
#if defined(JEMALLOC_HAVE_MADVISE_COLLAPSE) && \
(defined(MADV_COLLAPSE) || defined(JEMALLOC_MADV_COLLAPSE))
# if defined(MADV_COLLAPSE)
return (madvise(addr, size, MADV_COLLAPSE) != 0);
# elif defined(JEMALLOC_MADV_COLLAPSE)
/* Fallback constant for kernels whose headers predate MADV_COLLAPSE. */
return (madvise(addr, size, JEMALLOC_MADV_COLLAPSE) != 0);
# endif
#else
/* MADV_COLLAPSE unsupported at build time: report failure. */
return true;
#endif
}
bool
pages_dontdump(void *addr, size_t size) {
assert(PAGE_ADDR2BASE(addr) == addr);

View file

@ -844,6 +844,7 @@ stats_arena_hpa_shard_counters_print(emitter_t *emitter, unsigned i,
uint64_t npurge_passes;
uint64_t npurges;
uint64_t nhugifies;
uint64_t nhugify_failures;
uint64_t ndehugifies;
CTL_M2_GET("stats.arenas.0.hpa_shard.npurge_passes",
@ -852,6 +853,8 @@ stats_arena_hpa_shard_counters_print(emitter_t *emitter, unsigned i,
i, &npurges, uint64_t);
CTL_M2_GET("stats.arenas.0.hpa_shard.nhugifies",
i, &nhugifies, uint64_t);
CTL_M2_GET("stats.arenas.0.hpa_shard.nhugify_failures",
i, &nhugify_failures, uint64_t);
CTL_M2_GET("stats.arenas.0.hpa_shard.ndehugifies",
i, &ndehugifies, uint64_t);
@ -860,11 +863,13 @@ stats_arena_hpa_shard_counters_print(emitter_t *emitter, unsigned i,
" Purge passes: %" FMTu64 " (%" FMTu64 " / sec)\n"
" Purges: %" FMTu64 " (%" FMTu64 " / sec)\n"
" Hugeifies: %" FMTu64 " (%" FMTu64 " / sec)\n"
" Hugify failures: %" FMTu64 " (%" FMTu64 " / sec)\n"
" Dehugifies: %" FMTu64 " (%" FMTu64 " / sec)\n"
"\n",
npurge_passes, rate_per_second(npurge_passes, uptime),
npurges, rate_per_second(npurges, uptime),
nhugifies, rate_per_second(nhugifies, uptime),
nhugify_failures, rate_per_second(nhugify_failures, uptime),
ndehugifies, rate_per_second(ndehugifies, uptime));
emitter_json_kv(emitter, "npurge_passes", emitter_type_uint64,
@ -873,6 +878,8 @@ stats_arena_hpa_shard_counters_print(emitter_t *emitter, unsigned i,
&npurges);
emitter_json_kv(emitter, "nhugifies", emitter_type_uint64,
&nhugifies);
emitter_json_kv(emitter, "nhugify_failures", emitter_type_uint64,
&nhugify_failures);
emitter_json_kv(emitter, "ndehugifies", emitter_type_uint64,
&ndehugifies);
}
@ -1578,6 +1585,7 @@ stats_general_print(emitter_t *emitter) {
OPT_WRITE_SIZE_T("hpa_slab_max_alloc")
OPT_WRITE_SIZE_T("hpa_hugification_threshold")
OPT_WRITE_UINT64("hpa_hugify_delay_ms")
OPT_WRITE_BOOL("hpa_hugify_sync")
OPT_WRITE_UINT64("hpa_min_purge_interval_ms")
OPT_WRITE_SSIZE_T("experimental_hpa_max_purge_nhp")
if (je_mallctl("opt.hpa_dirty_mult", (void *)&u32v, &u32sz, NULL, 0)

View file

@ -32,6 +32,8 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = {
false,
/* hugify_delay_ms */
10 * 1000,
/* hugify_sync */
false,
/* min_purge_interval_ms */
5 * 1000,
/* experimental_max_purge_nhp */
@ -49,6 +51,8 @@ static hpa_shard_opts_t test_hpa_shard_opts_purge = {
true,
/* hugify_delay_ms */
0,
/* hugify_sync */
false,
/* min_purge_interval_ms */
5 * 1000,
/* experimental_max_purge_nhp */
@ -371,9 +375,10 @@ defer_test_purge(void *ptr, size_t size) {
}
/* Counts invocations of the hugify hook so tests can assert deferral. */
static size_t ndefer_hugify_calls = 0;
/*
 * Test stub for the hugify hook: records the call and reports success
 * (false, per the pages_* error convention) without touching memory.
 */
static bool
defer_test_hugify(void *ptr, size_t size, bool sync) {
	(void)ptr;
	(void)size;
	(void)sync;
	++ndefer_hugify_calls;
	return false;
}
static size_t ndefer_dehugify_calls = 0;

View file

@ -288,6 +288,7 @@ TEST_BEGIN(test_mallctl_opt) {
TEST_MALLCTL_OPT(const char *, dss, always);
TEST_MALLCTL_OPT(bool, hpa, always);
TEST_MALLCTL_OPT(size_t, hpa_slab_max_alloc, always);
TEST_MALLCTL_OPT(bool, hpa_hugify_sync, always);
TEST_MALLCTL_OPT(size_t, hpa_sec_nshards, always);
TEST_MALLCTL_OPT(size_t, hpa_sec_max_alloc, always);
TEST_MALLCTL_OPT(size_t, hpa_sec_max_bytes, always);