diff --git a/configure.ac b/configure.ac index d037fed5..a330e33e 100644 --- a/configure.ac +++ b/configure.ac @@ -2491,6 +2491,16 @@ if test "x${je_cv_madvise}" = "xyes" ; then if test "x${je_cv_madv_nocore}" = "xyes" ; then AC_DEFINE([JEMALLOC_MADVISE_NOCORE], [ ], [ ]) fi + + dnl Check for madvise(..., MADV_COLLAPSE). + JE_COMPILABLE([madvise(..., MADV_COLLAPSE)], [ +#include <sys/mman.h> +], [ + madvise((void *)0, 0, MADV_COLLAPSE); +], [je_cv_madv_collapse]) + if test "x${je_cv_madv_collapse}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_MADVISE_COLLAPSE], [ ], [ ]) + fi else dnl Check for posix_madvise. JE_COMPILABLE([posix_madvise], [ diff --git a/include/jemalloc/internal/hpa.h b/include/jemalloc/internal/hpa.h index 4805efaf..4c410c40 100644 --- a/include/jemalloc/internal/hpa.h +++ b/include/jemalloc/internal/hpa.h @@ -61,6 +61,14 @@ struct hpa_shard_nonderived_stats_s { * Guarded by mtx. */ uint64_t nhugifies; + + /* + * The number of times we've tried to hugify a pageslab, but failed. + * + * Guarded by mtx. + */ + uint64_t nhugify_failures; + /* * The number of times we've dehugified a pageslab. 
* diff --git a/include/jemalloc/internal/hpa_hooks.h b/include/jemalloc/internal/hpa_hooks.h index 72f3a43c..b04b04f6 100644 --- a/include/jemalloc/internal/hpa_hooks.h +++ b/include/jemalloc/internal/hpa_hooks.h @@ -9,7 +9,7 @@ struct hpa_hooks_s { void *(*map)(size_t size); void (*unmap)(void *ptr, size_t size); void (*purge)(void *ptr, size_t size); - void (*hugify)(void *ptr, size_t size); + bool (*hugify)(void *ptr, size_t size, bool sync); void (*dehugify)(void *ptr, size_t size); void (*curtime)(nstime_t *r_time, bool first_reading); uint64_t (*ms_since)(nstime_t *r_time); diff --git a/include/jemalloc/internal/hpa_opts.h b/include/jemalloc/internal/hpa_opts.h index ee2bd40c..42246172 100644 --- a/include/jemalloc/internal/hpa_opts.h +++ b/include/jemalloc/internal/hpa_opts.h @@ -45,6 +45,11 @@ struct hpa_shard_opts_s { */ uint64_t hugify_delay_ms; + /* + * Hugify pages synchronously. + */ + bool hugify_sync; + /* * Minimum amount of time between purges. */ @@ -73,6 +78,8 @@ struct hpa_shard_opts_s { false, \ /* hugify_delay_ms */ \ 10 * 1000, \ + /* hugify_sync */ \ + false, \ /* min_purge_interval_ms */ \ 5 * 1000, \ /* experimental_max_purge_nhp */ \ diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index f5b1a924..5cf77f47 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -308,6 +308,13 @@ */ #undef JEMALLOC_HAVE_MADVISE_HUGE +/* + * Defined if best-effort synchronous collapse of the native + * pages mapped by the memory range into transparent huge pages is supported + * via MADV_COLLAPSE arguments to madvise(2). + */ +#undef JEMALLOC_HAVE_MADVISE_COLLAPSE + /* * Methods for purging unused pages differ between operating systems. 
* diff --git a/include/jemalloc/internal/jemalloc_preamble.h.in b/include/jemalloc/internal/jemalloc_preamble.h.in index ebce5d56..a59c3489 100644 --- a/include/jemalloc/internal/jemalloc_preamble.h.in +++ b/include/jemalloc/internal/jemalloc_preamble.h.in @@ -57,6 +57,15 @@ # define JEMALLOC_MADV_FREE 8 #endif +/* + * Can be defined at compile time, in cases, when it is known + * madvise(..., MADV_COLLAPSE) feature is supported, but MADV_COLLAPSE + * constant is not defined. + */ +#ifdef JEMALLOC_DEFINE_MADVISE_COLLAPSE +# define JEMALLOC_MADV_COLLAPSE 25 +#endif + static const bool config_debug = #ifdef JEMALLOC_DEBUG true diff --git a/include/jemalloc/internal/pages.h b/include/jemalloc/internal/pages.h index 6c295b43..0dcf96dc 100644 --- a/include/jemalloc/internal/pages.h +++ b/include/jemalloc/internal/pages.h @@ -123,6 +123,7 @@ bool pages_purge_lazy(void *addr, size_t size); bool pages_purge_forced(void *addr, size_t size); bool pages_huge(void *addr, size_t size); bool pages_nohuge(void *addr, size_t size); +bool pages_collapse(void *addr, size_t size); bool pages_dontdump(void *addr, size_t size); bool pages_dodump(void *addr, size_t size); bool pages_boot(void); diff --git a/src/ctl.c b/src/ctl.c index 690bbabc..40e75fb7 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -103,6 +103,7 @@ CTL_PROTO(opt_hpa) CTL_PROTO(opt_hpa_slab_max_alloc) CTL_PROTO(opt_hpa_hugification_threshold) CTL_PROTO(opt_hpa_hugify_delay_ms) +CTL_PROTO(opt_hpa_hugify_sync) CTL_PROTO(opt_hpa_min_purge_interval_ms) CTL_PROTO(opt_experimental_hpa_max_purge_nhp) CTL_PROTO(opt_hpa_dirty_mult) @@ -263,6 +264,7 @@ INDEX_PROTO(stats_arenas_i_extents_j) CTL_PROTO(stats_arenas_i_hpa_shard_npurge_passes) CTL_PROTO(stats_arenas_i_hpa_shard_npurges) CTL_PROTO(stats_arenas_i_hpa_shard_nhugifies) +CTL_PROTO(stats_arenas_i_hpa_shard_nhugify_failures) CTL_PROTO(stats_arenas_i_hpa_shard_ndehugifies) /* We have a set of stats for full slabs. 
*/ @@ -462,6 +464,7 @@ static const ctl_named_node_t opt_node[] = { {NAME("hpa_hugification_threshold"), CTL(opt_hpa_hugification_threshold)}, {NAME("hpa_hugify_delay_ms"), CTL(opt_hpa_hugify_delay_ms)}, + {NAME("hpa_hugify_sync"), CTL(opt_hpa_hugify_sync)}, {NAME("hpa_min_purge_interval_ms"), CTL(opt_hpa_min_purge_interval_ms)}, {NAME("experimental_hpa_max_purge_nhp"), CTL(opt_experimental_hpa_max_purge_nhp)}, @@ -834,6 +837,8 @@ static const ctl_named_node_t stats_arenas_i_hpa_shard_node[] = { {NAME("npurge_passes"), CTL(stats_arenas_i_hpa_shard_npurge_passes)}, {NAME("npurges"), CTL(stats_arenas_i_hpa_shard_npurges)}, {NAME("nhugifies"), CTL(stats_arenas_i_hpa_shard_nhugifies)}, + {NAME("nhugify_failures"), + CTL(stats_arenas_i_hpa_shard_nhugify_failures)}, {NAME("ndehugifies"), CTL(stats_arenas_i_hpa_shard_ndehugifies)} }; @@ -2200,6 +2205,7 @@ CTL_RO_NL_GEN(opt_hpa, opt_hpa, bool) CTL_RO_NL_GEN(opt_hpa_hugification_threshold, opt_hpa_opts.hugification_threshold, size_t) CTL_RO_NL_GEN(opt_hpa_hugify_delay_ms, opt_hpa_opts.hugify_delay_ms, uint64_t) +CTL_RO_NL_GEN(opt_hpa_hugify_sync, opt_hpa_opts.hugify_sync, bool) CTL_RO_NL_GEN(opt_hpa_min_purge_interval_ms, opt_hpa_opts.min_purge_interval_ms, uint64_t) CTL_RO_NL_GEN(opt_experimental_hpa_max_purge_nhp, @@ -4061,6 +4067,9 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_npurges, arenas_i(mib[2])->astats->hpastats.nonderived_stats.npurges, uint64_t); CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nhugifies, arenas_i(mib[2])->astats->hpastats.nonderived_stats.nhugifies, uint64_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_nhugify_failures, + arenas_i(mib[2])->astats->hpastats.nonderived_stats.nhugify_failures, + uint64_t); CTL_RO_CGEN(config_stats, stats_arenas_i_hpa_shard_ndehugifies, arenas_i(mib[2])->astats->hpastats.nonderived_stats.ndehugifies, uint64_t); diff --git a/src/hpa.c b/src/hpa.c index d1558821..14541413 100644 --- a/src/hpa.c +++ b/src/hpa.c @@ -210,6 +210,7 @@ 
hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap, shard->stats.npurge_passes = 0; shard->stats.npurges = 0; shard->stats.nhugifies = 0; + shard->stats.nhugify_failures = 0; shard->stats.ndehugifies = 0; /* @@ -242,6 +243,7 @@ hpa_shard_nonderived_stats_accum(hpa_shard_nonderived_stats_t *dst, dst->npurge_passes += src->npurge_passes; dst->npurges += src->npurges; dst->nhugifies += src->nhugifies; + dst->nhugify_failures += src->nhugify_failures; dst->ndehugifies += src->ndehugifies; } @@ -499,10 +501,23 @@ hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) { malloc_mutex_unlock(tsdn, &shard->mtx); - shard->central->hooks.hugify(hpdata_addr_get(to_hugify), HUGEPAGE); + bool err = shard->central->hooks.hugify(hpdata_addr_get(to_hugify), + HUGEPAGE, shard->opts.hugify_sync); malloc_mutex_lock(tsdn, &shard->mtx); shard->stats.nhugifies++; + if (err) { + /* + * When asynchronous hugification is used + * (shard->opts.hugify_sync option is false), we are not + * expecting to get here, unless something went terribly wrong. + * Because underlying syscall is only setting kernel flag for + * memory range (actual hugification happens asynchronously + * and we are not getting any feedback about its outcome), we + * expect syscall to be successful all the time. 
+ */ + shard->stats.nhugify_failures++; + } psset_update_begin(&shard->psset, to_hugify); hpdata_hugify(to_hugify); diff --git a/src/hpa_hooks.c b/src/hpa_hooks.c index f43f05eb..4628c14f 100644 --- a/src/hpa_hooks.c +++ b/src/hpa_hooks.c @@ -6,7 +6,7 @@ static void *hpa_hooks_map(size_t size); static void hpa_hooks_unmap(void *ptr, size_t size); static void hpa_hooks_purge(void *ptr, size_t size); -static void hpa_hooks_hugify(void *ptr, size_t size); +static bool hpa_hooks_hugify(void *ptr, size_t size, bool sync); static void hpa_hooks_dehugify(void *ptr, size_t size); static void hpa_hooks_curtime(nstime_t *r_nstime, bool first_reading); static uint64_t hpa_hooks_ms_since(nstime_t *past_nstime); @@ -37,10 +37,27 @@ hpa_hooks_purge(void *ptr, size_t size) { pages_purge_forced(ptr, size); } -static void -hpa_hooks_hugify(void *ptr, size_t size) { +static bool +hpa_hooks_hugify(void *ptr, size_t size, bool sync) { + /* + * We mark memory range as huge independently of which hugification + * technique is used (synchronous or asynchronous) to have correct + * VmFlags set for introspection and accounting purposes. If + * synchronous hugification is enabled and pages_collapse call fails, + * then we hope memory range will be hugified asynchronously by + * khugepaged eventually. Right now, 3 out of 4 error return codes of + * madvise(..., MADV_COLLAPSE) are retryable. Instead of retrying, we + * just fall back to asynchronous khugepaged hugification to simplify + * implementation, even if we might know khugepaged fallback will not + * be successful (current madvise(..., MADV_COLLAPSE) implementation + * hints that, when EINVAL is returned, it is likely that khugepaged won't be + * able to collapse memory range into hugepage either). 
+ */ bool err = pages_huge(ptr, size); - (void)err; + if (sync) { + err = pages_collapse(ptr, size); + } + return err; } static void diff --git a/src/jemalloc.c b/src/jemalloc.c index 428a50ef..248de28b 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1093,6 +1093,15 @@ validate_hpa_settings(void) { if (opt_hpa_opts.dirty_mult != (fxp_t)-1 && validate_hpa_ratios()) { had_conf_error = true; } +#ifndef JEMALLOC_HAVE_MADVISE_COLLAPSE + if (opt_hpa_opts.hugify_sync) { + had_conf_error = true; + malloc_printf( + "<jemalloc>: hpa_hugify_sync config option is enabled, " + "but MADV_COLLAPSE support was not detected at build " + "time."); + } +#endif } static void @@ -1566,6 +1575,9 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], 0, 0, CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, false); + CONF_HANDLE_BOOL( + opt_hpa_opts.hugify_sync, "hpa_hugify_sync"); + CONF_HANDLE_UINT64_T( opt_hpa_opts.min_purge_interval_ms, "hpa_min_purge_interval_ms", 0, 0, diff --git a/src/pages.c b/src/pages.c index 5b55a046..26fd8d5d 100644 --- a/src/pages.c +++ b/src/pages.c @@ -567,6 +567,30 @@ pages_nohuge_unaligned(void *addr, size_t size) { return pages_nohuge_impl(addr, size, false); } +bool +pages_collapse(void *addr, size_t size) { + assert(PAGE_ADDR2BASE(addr) == addr); + assert(PAGE_CEILING(size) == size); + /* + * There is one more MADV_COLLAPSE precondition that is not easy to + * express with assert statement. In order for the madvise(addr, size, + * MADV_COLLAPSE) call to be successful, at least one page in the range + * must currently be backed by physical memory. In particular, this + * means we can't call pages_collapse on freshly mapped memory region. + * See madvise(2) man page for more details. 
+ */ +#if defined(JEMALLOC_HAVE_MADVISE_COLLAPSE) && \ + (defined(MADV_COLLAPSE) || defined(JEMALLOC_MADV_COLLAPSE)) +# if defined(MADV_COLLAPSE) + return (madvise(addr, size, MADV_COLLAPSE) != 0); +# elif defined(JEMALLOC_MADV_COLLAPSE) + return (madvise(addr, size, JEMALLOC_MADV_COLLAPSE) != 0); +# endif +#else + return true; +#endif +} + bool pages_dontdump(void *addr, size_t size) { assert(PAGE_ADDR2BASE(addr) == addr); diff --git a/src/stats.c b/src/stats.c index 89dd1916..7fbaa5cc 100644 --- a/src/stats.c +++ b/src/stats.c @@ -844,6 +844,7 @@ stats_arena_hpa_shard_counters_print(emitter_t *emitter, unsigned i, uint64_t npurge_passes; uint64_t npurges; uint64_t nhugifies; + uint64_t nhugify_failures; uint64_t ndehugifies; CTL_M2_GET("stats.arenas.0.hpa_shard.npurge_passes", @@ -852,6 +853,8 @@ stats_arena_hpa_shard_counters_print(emitter_t *emitter, unsigned i, i, &npurges, uint64_t); CTL_M2_GET("stats.arenas.0.hpa_shard.nhugifies", i, &nhugifies, uint64_t); + CTL_M2_GET("stats.arenas.0.hpa_shard.nhugify_failures", + i, &nhugify_failures, uint64_t); CTL_M2_GET("stats.arenas.0.hpa_shard.ndehugifies", i, &ndehugifies, uint64_t); @@ -860,11 +863,13 @@ stats_arena_hpa_shard_counters_print(emitter_t *emitter, unsigned i, " Purge passes: %" FMTu64 " (%" FMTu64 " / sec)\n" " Purges: %" FMTu64 " (%" FMTu64 " / sec)\n" " Hugeifies: %" FMTu64 " (%" FMTu64 " / sec)\n" + " Hugify failures: %" FMTu64 " (%" FMTu64 " / sec)\n" " Dehugifies: %" FMTu64 " (%" FMTu64 " / sec)\n" "\n", npurge_passes, rate_per_second(npurge_passes, uptime), npurges, rate_per_second(npurges, uptime), nhugifies, rate_per_second(nhugifies, uptime), + nhugify_failures, rate_per_second(nhugify_failures, uptime), ndehugifies, rate_per_second(ndehugifies, uptime)); emitter_json_kv(emitter, "npurge_passes", emitter_type_uint64, @@ -873,6 +878,8 @@ stats_arena_hpa_shard_counters_print(emitter_t *emitter, unsigned i, &npurges); emitter_json_kv(emitter, "nhugifies", emitter_type_uint64, &nhugifies); + 
emitter_json_kv(emitter, "nhugify_failures", emitter_type_uint64, + &nhugify_failures); emitter_json_kv(emitter, "ndehugifies", emitter_type_uint64, &ndehugifies); } @@ -1578,6 +1585,7 @@ stats_general_print(emitter_t *emitter) { OPT_WRITE_SIZE_T("hpa_slab_max_alloc") OPT_WRITE_SIZE_T("hpa_hugification_threshold") OPT_WRITE_UINT64("hpa_hugify_delay_ms") + OPT_WRITE_BOOL("hpa_hugify_sync") OPT_WRITE_UINT64("hpa_min_purge_interval_ms") OPT_WRITE_SSIZE_T("experimental_hpa_max_purge_nhp") if (je_mallctl("opt.hpa_dirty_mult", (void *)&u32v, &u32sz, NULL, 0) diff --git a/test/unit/hpa.c b/test/unit/hpa.c index 747f98ef..50b96a87 100644 --- a/test/unit/hpa.c +++ b/test/unit/hpa.c @@ -32,6 +32,8 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = { false, /* hugify_delay_ms */ 10 * 1000, + /* hugify_sync */ + false, /* min_purge_interval_ms */ 5 * 1000, /* experimental_max_purge_nhp */ @@ -49,6 +51,8 @@ static hpa_shard_opts_t test_hpa_shard_opts_purge = { true, /* hugify_delay_ms */ 0, + /* hugify_sync */ + false, /* min_purge_interval_ms */ 5 * 1000, /* experimental_max_purge_nhp */ @@ -371,9 +375,10 @@ defer_test_purge(void *ptr, size_t size) { } static size_t ndefer_hugify_calls = 0; -static void -defer_test_hugify(void *ptr, size_t size) { +static bool +defer_test_hugify(void *ptr, size_t size, bool sync) { ++ndefer_hugify_calls; + return false; } static size_t ndefer_dehugify_calls = 0; diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 65e84370..8c8fb18c 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -288,6 +288,7 @@ TEST_BEGIN(test_mallctl_opt) { TEST_MALLCTL_OPT(const char *, dss, always); TEST_MALLCTL_OPT(bool, hpa, always); TEST_MALLCTL_OPT(size_t, hpa_slab_max_alloc, always); + TEST_MALLCTL_OPT(bool, hpa_hugify_sync, always); TEST_MALLCTL_OPT(size_t, hpa_sec_nshards, always); TEST_MALLCTL_OPT(size_t, hpa_sec_max_alloc, always); TEST_MALLCTL_OPT(size_t, hpa_sec_max_bytes, always);