Add opt hpa_hugify_sync to hugify synchronously

Linux 6.1 introduced `MADV_COLLAPSE` flag to perform a best-effort
synchronous collapse of the native pages mapped by the memory range into
transparent huge pages.

Synchronous hugification might be beneficial for at least two reasons:
we no longer rely on khugepaged, and we get instant feedback if the
range wasn't hugified.

If the `hpa_hugify_sync` option is on, we'll try to perform a synchronous
collapse, and if it isn't successful, we'll fall back to asynchronous
behaviour.
This commit is contained in:
Dmitry Ilvokhin 2024-10-31 11:43:11 -07:00 committed by stanjo74
parent a361e886e2
commit 0ce13c6fb5
15 changed files with 141 additions and 8 deletions

View file

@ -2491,6 +2491,16 @@ if test "x${je_cv_madvise}" = "xyes" ; then
if test "x${je_cv_madv_nocore}" = "xyes" ; then
AC_DEFINE([JEMALLOC_MADVISE_NOCORE], [ ], [ ])
fi
dnl Check for madvise(..., MADV_COLLAPSE).
JE_COMPILABLE([madvise(..., MADV_COLLAPSE)], [
#include <sys/mman.h>
], [
madvise((void *)0, 0, MADV_COLLAPSE);
], [je_cv_madv_collapse])
if test "x${je_cv_madv_collapse}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_MADVISE_COLLAPSE], [ ], [ ])
fi
else
dnl Check for posix_madvise.
JE_COMPILABLE([posix_madvise], [

View file

@ -61,6 +61,14 @@ struct hpa_shard_nonderived_stats_s {
* Guarded by mtx.
*/
uint64_t nhugifies;
/*
* The number of times we've tried to hugify a pageslab, but failed.
*
* Guarded by mtx.
*/
uint64_t nhugify_failures;
/*
* The number of times we've dehugified a pageslab.
*

View file

@ -9,7 +9,7 @@ struct hpa_hooks_s {
void *(*map)(size_t size);
void (*unmap)(void *ptr, size_t size);
void (*purge)(void *ptr, size_t size);
void (*hugify)(void *ptr, size_t size);
bool (*hugify)(void *ptr, size_t size, bool sync);
void (*dehugify)(void *ptr, size_t size);
void (*curtime)(nstime_t *r_time, bool first_reading);
uint64_t (*ms_since)(nstime_t *r_time);

View file

@ -45,6 +45,11 @@ struct hpa_shard_opts_s {
*/
uint64_t hugify_delay_ms;
/*
* Hugify pages synchronously.
*/
bool hugify_sync;
/*
* Minimum amount of time between purges.
*/
@ -73,6 +78,8 @@ struct hpa_shard_opts_s {
false, \
/* hugify_delay_ms */ \
10 * 1000, \
/* hugify_sync */ \
false, \
/* min_purge_interval_ms */ \
5 * 1000, \
/* experimental_max_purge_nhp */ \

View file

@ -308,6 +308,13 @@
*/
#undef JEMALLOC_HAVE_MADVISE_HUGE
/*
* Defined if best-effort synchronous collapse of the native
* pages mapped by the memory range into transparent huge pages is supported
* via MADV_COLLAPSE arguments to madvise(2).
*/
#undef JEMALLOC_HAVE_MADVISE_COLLAPSE
/*
* Methods for purging unused pages differ between operating systems.
*

View file

@ -57,6 +57,15 @@
# define JEMALLOC_MADV_FREE 8
#endif
/*
 * Can be defined at compile time in cases where it is known that the
 * madvise(..., MADV_COLLAPSE) feature is supported, but the
 * MADV_COLLAPSE constant is not defined.
 */
#ifdef JEMALLOC_DEFINE_MADVISE_COLLAPSE
# define JEMALLOC_MADV_COLLAPSE 25
#endif
static const bool config_debug =
#ifdef JEMALLOC_DEBUG
true

View file

@ -123,6 +123,7 @@ bool pages_purge_lazy(void *addr, size_t size);
bool pages_purge_forced(void *addr, size_t size);
bool pages_huge(void *addr, size_t size);
bool pages_nohuge(void *addr, size_t size);
bool pages_collapse(void *addr, size_t size);
bool pages_dontdump(void *addr, size_t size);
bool pages_dodump(void *addr, size_t size);
bool pages_boot(void);

View file

@ -103,6 +103,7 @@ CTL_PROTO(opt_hpa)
CTL_PROTO(opt_hpa_slab_max_alloc)
CTL_PROTO(opt_hpa_hugification_threshold)
CTL_PROTO(opt_hpa_hugify_delay_ms)
CTL_PROTO(opt_hpa_hugify_sync)
CTL_PROTO(opt_hpa_min_purge_interval_ms)
CTL_PROTO(opt_experimental_hpa_max_purge_nhp)
CTL_PROTO(opt_hpa_dirty_mult)
@ -263,6 +264,7 @@ INDEX_PROTO(stats_arenas_i_extents_j)
CTL_PROTO(stats_arenas_i_hpa_shard_npurge_passes)
CTL_PROTO(stats_arenas_i_hpa_shard_npurges)
CTL_PROTO(stats_arenas_i_hpa_shard_nhugifies)
CTL_PROTO(stats_arenas_i_hpa_shard_nhugify_failures)
CTL_PROTO(stats_arenas_i_hpa_shard_ndehugifies)
/* We have a set of stats for full slabs. */
@ -462,6 +464,7 @@ static const ctl_named_node_t opt_node[] = {
{NAME("hpa_hugification_threshold"),
CTL(opt_hpa_hugification_threshold)},
{NAME("hpa_hugify_delay_ms"), CTL(opt_hpa_hugify_delay_ms)},
{NAME("hpa_hugify_sync"), CTL(opt_hpa_hugify_sync)},
{NAME("hpa_min_purge_interval_ms"), CTL(opt_hpa_min_purge_interval_ms)},
{NAME("experimental_hpa_max_purge_nhp"),
CTL(opt_experimental_hpa_max_purge_nhp)},
@ -834,6 +837,8 @@ static const ctl_named_node_t stats_arenas_i_hpa_shard_node[] = {
{NAME("npurge_passes"), CTL(stats_arenas_i_hpa_shard_npurge_passes)},
{NAME("npurges"), CTL(stats_arenas_i_hpa_shard_npurges)},
{NAME("nhugifies"), CTL(stats_arenas_i_hpa_shard_nhugifies)},
{NAME("nhugify_failures"),
CTL(stats_arenas_i_hpa_shard_nhugify_failures)},
{NAME("ndehugifies"), CTL(stats_arenas_i_hpa_shard_ndehugifies)}
};
@ -2200,6 +2205,7 @@ CTL_RO_NL_GEN(opt_hpa, opt_hpa, bool)
CTL_RO_NL_GEN(opt_hpa_hugification_threshold,
opt_hpa_opts.hugification_threshold, size_t)
CTL_RO_NL_GEN(opt_hpa_hugify_delay_ms, opt_hpa_opts.hugify_delay_ms, uint64_t)
CTL_RO_NL_GEN(opt_hpa_hugify_sync, opt_hpa_opts.hugify_sync, bool)
CTL_RO_NL_GEN(opt_hpa_min_purge_interval_ms, opt_hpa_opts.min_purge_interval_ms,
uint64_t)
CTL_RO_NL_GEN(opt_experimental_hpa_max_purge_nhp,
@ -4061,6 +4067,9 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_npurges,
arenas_i(mib[2])->astats->hpastats.nonderived_stats.npurges, uint64_t);
CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nhugifies,
arenas_i(mib[2])->astats->hpastats.nonderived_stats.nhugifies, uint64_t);
CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nhugify_failures,
arenas_i(mib[2])->astats->hpastats.nonderived_stats.nhugify_failures,
uint64_t);
CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_ndehugifies,
arenas_i(mib[2])->astats->hpastats.nonderived_stats.ndehugifies, uint64_t);

View file

@ -210,6 +210,7 @@ hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
shard->stats.npurge_passes = 0;
shard->stats.npurges = 0;
shard->stats.nhugifies = 0;
shard->stats.nhugify_failures = 0;
shard->stats.ndehugifies = 0;
/*
@ -242,6 +243,7 @@ hpa_shard_nonderived_stats_accum(hpa_shard_nonderived_stats_t *dst,
dst->npurge_passes += src->npurge_passes;
dst->npurges += src->npurges;
dst->nhugifies += src->nhugifies;
dst->nhugify_failures += src->nhugify_failures;
dst->ndehugifies += src->ndehugifies;
}
@ -499,10 +501,23 @@ hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) {
malloc_mutex_unlock(tsdn, &shard->mtx);
shard->central->hooks.hugify(hpdata_addr_get(to_hugify), HUGEPAGE);
bool err = shard->central->hooks.hugify(hpdata_addr_get(to_hugify),
HUGEPAGE, shard->opts.hugify_sync);
malloc_mutex_lock(tsdn, &shard->mtx);
shard->stats.nhugifies++;
if (err) {
/*
 * When asynchronous hugification is used
 * (shard->opts.hugify_sync option is false), we do not expect
 * to get here unless something went terribly wrong.  Because
 * the underlying syscall only sets a kernel flag for the
 * memory range (actual hugification happens asynchronously
 * and we get no feedback about its outcome), we expect the
 * syscall to succeed all the time.
 */
shard->stats.nhugify_failures++;
}
psset_update_begin(&shard->psset, to_hugify);
hpdata_hugify(to_hugify);

View file

@ -6,7 +6,7 @@
static void *hpa_hooks_map(size_t size);
static void hpa_hooks_unmap(void *ptr, size_t size);
static void hpa_hooks_purge(void *ptr, size_t size);
static void hpa_hooks_hugify(void *ptr, size_t size);
static bool hpa_hooks_hugify(void *ptr, size_t size, bool sync);
static void hpa_hooks_dehugify(void *ptr, size_t size);
static void hpa_hooks_curtime(nstime_t *r_nstime, bool first_reading);
static uint64_t hpa_hooks_ms_since(nstime_t *past_nstime);
@ -37,10 +37,27 @@ hpa_hooks_purge(void *ptr, size_t size) {
pages_purge_forced(ptr, size);
}
static void
hpa_hooks_hugify(void *ptr, size_t size) {
static bool
hpa_hooks_hugify(void *ptr, size_t size, bool sync) {
/*
* We mark memory range as huge independently on which hugification
* technique is used (synchronous or asynchronous) to have correct
* VmFlags set for introspection and accounting purposes. If
* synchronous hugification is enabled and pages_collapse call fails,
* then we hope memory range will be hugified asynchronously by
* khugepaged eventually. Right now, 3 out of 4 error return codes of
* madvise(..., MADV_COLLAPSE) are retryable. Instead of retrying, we
* just fallback to asynchronous khugepaged hugification to simplify
* implementation, even if we might know khugepaged fallback will not
* be successful (current madvise(..., MADV_COLLAPSE) implementation
* hints, when EINVAL is returned it is likely that khugepaged won't be
* able to collapse memory range into hugepage either).
*/
bool err = pages_huge(ptr, size);
(void)err;
if (sync) {
err = pages_collapse(ptr, size);
}
return err;
}
static void

View file

@ -1093,6 +1093,15 @@ validate_hpa_settings(void) {
if (opt_hpa_opts.dirty_mult != (fxp_t)-1 && validate_hpa_ratios()) {
had_conf_error = true;
}
#ifndef JEMALLOC_HAVE_MADVISE_COLLAPSE
if (opt_hpa_opts.hugify_sync) {
had_conf_error = true;
malloc_printf(
"<jemalloc>: hpa_hugify_sync config option is enabled, "
"but MADV_COLLAPSE support was not detected at build "
"time.");
}
#endif
}
static void
@ -1566,6 +1575,9 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
0, 0, CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX,
false);
CONF_HANDLE_BOOL(
opt_hpa_opts.hugify_sync, "hpa_hugify_sync");
CONF_HANDLE_UINT64_T(
opt_hpa_opts.min_purge_interval_ms,
"hpa_min_purge_interval_ms", 0, 0,

View file

@ -567,6 +567,30 @@ pages_nohuge_unaligned(void *addr, size_t size) {
return pages_nohuge_impl(addr, size, false);
}
bool
pages_collapse(void *addr, size_t size) {
assert(PAGE_ADDR2BASE(addr) == addr);
assert(PAGE_CEILING(size) == size);
/*
 * Best-effort synchronous collapse of the native pages backing
 * [addr, addr + size) into transparent huge pages via
 * madvise(..., MADV_COLLAPSE).  Returns true on failure (including when
 * MADV_COLLAPSE support was not detected at build time), false on
 * success.
 *
 * There is one more MADV_COLLAPSE precondition that is not easy to
 * express with an assert statement.  In order for the madvise(addr,
 * size, MADV_COLLAPSE) call to be successful, at least one page in the
 * range must currently be backed by physical memory.  In particular,
 * this means we can't call pages_collapse on a freshly mapped memory
 * region.  See the madvise(2) man page for more details.
 */
#if defined(JEMALLOC_HAVE_MADVISE_COLLAPSE) && \
(defined(MADV_COLLAPSE) || defined(JEMALLOC_MADV_COLLAPSE))
# if defined(MADV_COLLAPSE)
return (madvise(addr, size, MADV_COLLAPSE) != 0);
# elif defined(JEMALLOC_MADV_COLLAPSE)
/* Fallback constant for kernels whose headers predate MADV_COLLAPSE. */
return (madvise(addr, size, JEMALLOC_MADV_COLLAPSE) != 0);
# endif
#else
/* MADV_COLLAPSE unsupported at build time: report failure. */
return true;
#endif
}
bool
pages_dontdump(void *addr, size_t size) {
assert(PAGE_ADDR2BASE(addr) == addr);

View file

@ -844,6 +844,7 @@ stats_arena_hpa_shard_counters_print(emitter_t *emitter, unsigned i,
uint64_t npurge_passes;
uint64_t npurges;
uint64_t nhugifies;
uint64_t nhugify_failures;
uint64_t ndehugifies;
CTL_M2_GET("stats.arenas.0.hpa_shard.npurge_passes",
@ -852,6 +853,8 @@ stats_arena_hpa_shard_counters_print(emitter_t *emitter, unsigned i,
i, &npurges, uint64_t);
CTL_M2_GET("stats.arenas.0.hpa_shard.nhugifies",
i, &nhugifies, uint64_t);
CTL_M2_GET("stats.arenas.0.hpa_shard.nhugify_failures",
i, &nhugify_failures, uint64_t);
CTL_M2_GET("stats.arenas.0.hpa_shard.ndehugifies",
i, &ndehugifies, uint64_t);
@ -860,11 +863,13 @@ stats_arena_hpa_shard_counters_print(emitter_t *emitter, unsigned i,
" Purge passes: %" FMTu64 " (%" FMTu64 " / sec)\n"
" Purges: %" FMTu64 " (%" FMTu64 " / sec)\n"
" Hugeifies: %" FMTu64 " (%" FMTu64 " / sec)\n"
" Hugify failures: %" FMTu64 " (%" FMTu64 " / sec)\n"
" Dehugifies: %" FMTu64 " (%" FMTu64 " / sec)\n"
"\n",
npurge_passes, rate_per_second(npurge_passes, uptime),
npurges, rate_per_second(npurges, uptime),
nhugifies, rate_per_second(nhugifies, uptime),
nhugify_failures, rate_per_second(nhugify_failures, uptime),
ndehugifies, rate_per_second(ndehugifies, uptime));
emitter_json_kv(emitter, "npurge_passes", emitter_type_uint64,
@ -873,6 +878,8 @@ stats_arena_hpa_shard_counters_print(emitter_t *emitter, unsigned i,
&npurges);
emitter_json_kv(emitter, "nhugifies", emitter_type_uint64,
&nhugifies);
emitter_json_kv(emitter, "nhugify_failures", emitter_type_uint64,
&nhugify_failures);
emitter_json_kv(emitter, "ndehugifies", emitter_type_uint64,
&ndehugifies);
}
@ -1578,6 +1585,7 @@ stats_general_print(emitter_t *emitter) {
OPT_WRITE_SIZE_T("hpa_slab_max_alloc")
OPT_WRITE_SIZE_T("hpa_hugification_threshold")
OPT_WRITE_UINT64("hpa_hugify_delay_ms")
OPT_WRITE_BOOL("hpa_hugify_sync")
OPT_WRITE_UINT64("hpa_min_purge_interval_ms")
OPT_WRITE_SSIZE_T("experimental_hpa_max_purge_nhp")
if (je_mallctl("opt.hpa_dirty_mult", (void *)&u32v, &u32sz, NULL, 0)

View file

@ -32,6 +32,8 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = {
false,
/* hugify_delay_ms */
10 * 1000,
/* hugify_sync */
false,
/* min_purge_interval_ms */
5 * 1000,
/* experimental_max_purge_nhp */
@ -49,6 +51,8 @@ static hpa_shard_opts_t test_hpa_shard_opts_purge = {
true,
/* hugify_delay_ms */
0,
/* hugify_sync */
false,
/* min_purge_interval_ms */
5 * 1000,
/* experimental_max_purge_nhp */
@ -371,9 +375,10 @@ defer_test_purge(void *ptr, size_t size) {
}
/* Counts invocations of the hugify hook so tests can assert deferral. */
static size_t ndefer_hugify_calls = 0;
/*
 * Test stub for the hugify hook: records the call and reports success
 * (false, per the pages_* error convention) without touching memory.
 */
static bool
defer_test_hugify(void *ptr, size_t size, bool sync) {
	(void)ptr;
	(void)size;
	(void)sync;
	++ndefer_hugify_calls;
	return false;
}
static size_t ndefer_dehugify_calls = 0;

View file

@ -288,6 +288,7 @@ TEST_BEGIN(test_mallctl_opt) {
TEST_MALLCTL_OPT(const char *, dss, always);
TEST_MALLCTL_OPT(bool, hpa, always);
TEST_MALLCTL_OPT(size_t, hpa_slab_max_alloc, always);
TEST_MALLCTL_OPT(bool, hpa_hugify_sync, always);
TEST_MALLCTL_OPT(size_t, hpa_sec_nshards, always);
TEST_MALLCTL_OPT(size_t, hpa_sec_max_alloc, always);
TEST_MALLCTL_OPT(size_t, hpa_sec_max_bytes, always);