From 22440a0207cd7d7c624c78723ca1eeb8a4353e79 Mon Sep 17 00:00:00 2001 From: Qi Wang Date: Tue, 4 Feb 2025 18:31:11 -0800 Subject: [PATCH] Implement process_madvise support. Add opt.process_madvise_max_batch which determines if process_madvise is enabled (non-zero) and the max # of regions in each batch. Added another limiting factor which is the space to reserve on stack, which results in the max batch of 128. --- configure.ac | 11 +++ include/jemalloc/internal/extent.h | 12 ++++ .../internal/jemalloc_internal_defs.h.in | 3 + .../jemalloc/internal/jemalloc_preamble.h.in | 7 ++ include/jemalloc/internal/pages.h | 1 + include/jemalloc/internal/typed_list.h | 4 ++ src/ctl.c | 4 ++ src/extent.c | 36 ++++++++-- src/jemalloc.c | 5 ++ src/pac.c | 67 ++++++++++++++++++- src/pages.c | 58 ++++++++++++++++ src/stats.c | 1 + test/unit/mallctl.c | 1 + 13 files changed, 204 insertions(+), 6 deletions(-) diff --git a/configure.ac b/configure.ac index a55a5a08..eb500db9 100644 --- a/configure.ac +++ b/configure.ac @@ -2544,6 +2544,17 @@ if test "x${je_cv_madvise}" = "xyes" ; then if test "x${je_cv_madv_collapse}" = "xyes" ; then AC_DEFINE([JEMALLOC_HAVE_MADVISE_COLLAPSE], [ ], [ ]) fi + + dnl Check for process_madvise + JE_COMPILABLE([process_madvise(2)], [ +#include +#include +], [ + syscall(SYS_process_madvise, 0, (void *)0, 0, 0, 0); +], [je_cv_process_madvise]) + if test "x${je_cv_process_madvise}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_PROCESS_MADVISE], [ ], [ ]) + fi else dnl Check for posix_madvise. 
JE_COMPILABLE([posix_madvise], [ diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 17feb703..be61db8d 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -21,6 +21,16 @@ #define LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT 6 extern size_t opt_lg_extent_max_active_fit; +#define PROCESS_MADVISE_MAX_BATCH_DEFAULT 0 +extern size_t opt_process_madvise_max_batch; + +#ifdef JEMALLOC_HAVE_PROCESS_MADVISE +/* The iovec is on stack. Limit the max batch to avoid stack overflow. */ +#define PROCESS_MADVISE_MAX_BATCH_LIMIT (VARIABLE_ARRAY_SIZE_MAX / sizeof(struct iovec)) +#else +#define PROCESS_MADVISE_MAX_BATCH_LIMIT 0 +#endif + edata_t *ecache_alloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment, bool zero, bool guarded); @@ -42,6 +52,8 @@ edata_t *extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, bool growing_retained); void extent_dalloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata); +void extent_dalloc_wrapper_purged(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata); void extent_destroy_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata); bool extent_commit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index e76eaaf4..2e47438a 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -345,6 +345,9 @@ */ #undef JEMALLOC_MADVISE_NOCORE +/* Defined if process_madvise(2) is available. */ +#undef JEMALLOC_HAVE_PROCESS_MADVISE + /* Defined if mprotect(2) is available. 
*/ #undef JEMALLOC_HAVE_MPROTECT diff --git a/include/jemalloc/internal/jemalloc_preamble.h.in b/include/jemalloc/internal/jemalloc_preamble.h.in index ef637a2d..eba475a6 100644 --- a/include/jemalloc/internal/jemalloc_preamble.h.in +++ b/include/jemalloc/internal/jemalloc_preamble.h.in @@ -87,6 +87,13 @@ static const bool have_madvise_huge = false #endif ; +static const bool have_process_madvise = +#ifdef JEMALLOC_HAVE_PROCESS_MADVISE + true +#else + false +#endif + ; static const bool config_fill = #ifdef JEMALLOC_FILL true diff --git a/include/jemalloc/internal/pages.h b/include/jemalloc/internal/pages.h index 0dcf96dc..366bc30b 100644 --- a/include/jemalloc/internal/pages.h +++ b/include/jemalloc/internal/pages.h @@ -121,6 +121,7 @@ bool pages_commit(void *addr, size_t size); bool pages_decommit(void *addr, size_t size); bool pages_purge_lazy(void *addr, size_t size); bool pages_purge_forced(void *addr, size_t size); +bool pages_purge_process_madvise(void *vec, size_t vec_len, size_t total_bytes); bool pages_huge(void *addr, size_t size); bool pages_nohuge(void *addr, size_t size); bool pages_collapse(void *addr, size_t size); diff --git a/include/jemalloc/internal/typed_list.h b/include/jemalloc/internal/typed_list.h index 6535055a..7c4826fc 100644 --- a/include/jemalloc/internal/typed_list.h +++ b/include/jemalloc/internal/typed_list.h @@ -22,6 +22,10 @@ static inline el_type * \ list_type##_last(const list_type##_t *list) { \ return ql_last(&list->head, linkage); \ } \ +static inline el_type * \ +list_type##_next(const list_type##_t *list, el_type *item) { \ + return ql_next(&list->head, item, linkage); \ +} \ static inline void \ list_type##_append(list_type##_t *list, el_type *item) { \ ql_elm_new(item, linkage); \ diff --git a/src/ctl.c b/src/ctl.c index 73d4cb66..c55d9719 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -169,6 +169,7 @@ CTL_PROTO(opt_prof_time_res) CTL_PROTO(opt_lg_san_uaf_align) CTL_PROTO(opt_zero_realloc) CTL_PROTO(opt_limit_usize_gap) 
+CTL_PROTO(opt_process_madvise_max_batch) CTL_PROTO(opt_malloc_conf_symlink) CTL_PROTO(opt_malloc_conf_env_var) CTL_PROTO(opt_malloc_conf_global_var) @@ -559,6 +560,7 @@ static const ctl_named_node_t opt_node[] = { {NAME("debug_double_free_max_scan"), CTL(opt_debug_double_free_max_scan)}, {NAME("limit_usize_gap"), CTL(opt_limit_usize_gap)}, + {NAME("process_madvise_max_batch"), CTL(opt_process_madvise_max_batch)}, {NAME("malloc_conf"), CHILD(named, opt_malloc_conf)} }; @@ -2316,6 +2318,8 @@ CTL_RO_NL_GEN(opt_lg_tcache_flush_large_div, opt_lg_tcache_flush_large_div, CTL_RO_NL_GEN(opt_thp, thp_mode_names[opt_thp], const char *) CTL_RO_NL_GEN(opt_lg_extent_max_active_fit, opt_lg_extent_max_active_fit, size_t) +CTL_RO_NL_GEN(opt_process_madvise_max_batch, opt_process_madvise_max_batch, + size_t) CTL_RO_NL_CGEN(config_prof, opt_prof, opt_prof, bool) CTL_RO_NL_CGEN(config_prof, opt_prof_prefix, opt_prof_prefix, const char *) CTL_RO_NL_CGEN(config_prof, opt_prof_active, opt_prof_active, bool) diff --git a/src/extent.c b/src/extent.c index 30942491..e61b7f9c 100644 --- a/src/extent.c +++ b/src/extent.c @@ -12,6 +12,13 @@ /* Data. 
*/ size_t opt_lg_extent_max_active_fit = LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT; +size_t opt_process_madvise_max_batch = +#ifdef JEMALLOC_HAVE_PROCESS_MADVISE + PROCESS_MADVISE_MAX_BATCH_DEFAULT; +#else + 0 +#endif + ; static bool extent_commit_impl(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata, size_t offset, size_t length, bool growing_retained); @@ -1032,6 +1039,29 @@ extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, return edata; } +static void +extent_dalloc_wrapper_finish(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata) { + if (config_prof) { + extent_gdump_sub(tsdn, edata); + } + extent_record(tsdn, pac, ehooks, &pac->ecache_retained, edata); +} + +void +extent_dalloc_wrapper_purged(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata) { + assert(edata_pai_get(edata) == EXTENT_PAI_PAC); + witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn), + WITNESS_RANK_CORE, 0); + + /* Verify that will not go down the dalloc / munmap route. */ + assert(ehooks_dalloc_will_fail(ehooks)); + + edata_zeroed_set(edata, true); + extent_dalloc_wrapper_finish(tsdn, pac, ehooks, edata); +} + void extent_dalloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata) { @@ -1077,11 +1107,7 @@ extent_dalloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, } edata_zeroed_set(edata, zeroed); - if (config_prof) { - extent_gdump_sub(tsdn, edata); - } - - extent_record(tsdn, pac, ehooks, &pac->ecache_retained, edata); + extent_dalloc_wrapper_finish(tsdn, pac, ehooks, edata); } void diff --git a/src/jemalloc.c b/src/jemalloc.c index 67456bb7..9f4bc785 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1361,6 +1361,11 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], "muzzy_decay_ms", -1, NSTIME_SEC_MAX * KQU(1000) < QU(SSIZE_MAX) ? 
NSTIME_SEC_MAX * KQU(1000) : SSIZE_MAX); + CONF_HANDLE_SIZE_T(opt_process_madvise_max_batch, + "process_madvise_max_batch", 0, + PROCESS_MADVISE_MAX_BATCH_LIMIT, + CONF_DONT_CHECK_MIN, CONF_CHECK_MAX, + /* clip */ true) CONF_HANDLE_BOOL(opt_stats_print, "stats_print") if (CONF_MATCH("stats_print_opts")) { init_opt_stats_opts(v, vlen, diff --git a/src/pac.c b/src/pac.c index 3523ef3d..12c1e444 100644 --- a/src/pac.c +++ b/src/pac.c @@ -435,6 +435,44 @@ pac_stash_decayed(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache, return nstashed; } +static bool +decay_with_process_madvise(edata_list_inactive_t *decay_extents) { + cassert(have_process_madvise); + assert(opt_process_madvise_max_batch > 0); +#ifndef JEMALLOC_HAVE_PROCESS_MADVISE + return true; +#else + assert(opt_process_madvise_max_batch <= + PROCESS_MADVISE_MAX_BATCH_LIMIT); + size_t len = opt_process_madvise_max_batch; + VARIABLE_ARRAY(struct iovec, vec, len); + + size_t cur = 0, total_bytes = 0; + for (edata_t *edata = edata_list_inactive_first(decay_extents); + edata != NULL; + edata = edata_list_inactive_next(decay_extents, edata)) { + size_t pages_bytes = edata_size_get(edata); + vec[cur].iov_base = edata_base_get(edata); + vec[cur].iov_len = pages_bytes; + total_bytes += pages_bytes; + cur++; + if (cur == len) { + bool err = pages_purge_process_madvise(vec, len, + total_bytes); + if (err) { + return true; + } + cur = 0; + total_bytes = 0; + } + } + if (cur > 0) { + return pages_purge_process_madvise(vec, cur, total_bytes); + } + return false; +#endif +} + static size_t pac_decay_stashed(tsdn_t *tsdn, pac_t *pac, decay_t *decay, pac_decay_stats_t *decay_stats, ecache_t *ecache, bool fully_decay, @@ -450,6 +488,28 @@ pac_decay_stashed(tsdn_t *tsdn, pac_t *pac, decay_t *decay, bool try_muzzy = !fully_decay && pac_decay_ms_get(pac, extent_state_muzzy) != 0; + bool purge_to_retained = !try_muzzy || + ecache->state == extent_state_muzzy; + /* + * Attempt process_madvise only if 1) enabled, 2) purging to retained, + * 
and 3) not using custom hooks. + */ + bool try_process_madvise = (opt_process_madvise_max_batch > 0) && + purge_to_retained && ehooks_dalloc_will_fail(ehooks); + + bool already_purged; + if (try_process_madvise) { + /* + * If anything unexpected happened during process_madvise + * (e.g. not supporting MADV_DONTNEED, or partial success for + * some reason), we will consider nothing is purged and fallback + * to the regular madvise. + */ + already_purged = !decay_with_process_madvise(decay_extents); + } else { + already_purged = false; + } + for (edata_t *edata = edata_list_inactive_first(decay_extents); edata != NULL; edata = edata_list_inactive_first(decay_extents)) { edata_list_inactive_remove(decay_extents, edata); @@ -473,7 +533,12 @@ pac_decay_stashed(tsdn_t *tsdn, pac_t *pac, decay_t *decay, } JEMALLOC_FALLTHROUGH; case extent_state_muzzy: - extent_dalloc_wrapper(tsdn, pac, ehooks, edata); + if (already_purged) { + extent_dalloc_wrapper_purged(tsdn, pac, ehooks, + edata); + } else { + extent_dalloc_wrapper(tsdn, pac, ehooks, edata); + } nunmapped += npages; break; case extent_state_active: diff --git a/src/pages.c b/src/pages.c index 26fd8d5d..babfd50f 100644 --- a/src/pages.c +++ b/src/pages.c @@ -617,6 +617,58 @@ pages_dodump(void *addr, size_t size) { #endif } +#ifdef JEMALLOC_HAVE_PROCESS_MADVISE +#include +#include +static int pidfd; + +static bool +init_process_madvise(void) { + if (opt_process_madvise_max_batch == 0) { + return false; + } + + if (opt_process_madvise_max_batch > PROCESS_MADVISE_MAX_BATCH_LIMIT) { + opt_process_madvise_max_batch = PROCESS_MADVISE_MAX_BATCH_LIMIT; + } + pid_t pid = getpid(); + pidfd = syscall(SYS_pidfd_open, pid, 0); + if (pidfd == -1) { + return true; + } + + return false; +} + +static bool +pages_purge_process_madvise_impl(void *vec, size_t vec_len, + size_t total_bytes) { + size_t purged_bytes = (size_t)syscall(SYS_process_madvise, pidfd, + (struct iovec *)vec, vec_len, MADV_DONTNEED, 0); + + return purged_bytes != 
total_bytes; +} + +#else + +static bool +init_process_madvise(void) { + return false; +} + +static bool +pages_purge_process_madvise_impl(void *vec, size_t vec_len, + size_t total_bytes) { + not_reached(); + return true; +} + +#endif + +bool +pages_purge_process_madvise(void *vec, size_t vec_len, size_t total_bytes) { + return pages_purge_process_madvise_impl(vec, vec_len, total_bytes); +} static size_t os_page_detect(void) { @@ -833,6 +885,12 @@ pages_boot(void) { os_pages_unmap(madv_free_page, PAGE); } #endif + if (init_process_madvise()) { + if (opt_abort) { + abort(); + } + return true; + } return false; } diff --git a/src/stats.c b/src/stats.c index b28b9942..58874bf8 100644 --- a/src/stats.c +++ b/src/stats.c @@ -1727,6 +1727,7 @@ stats_general_print(emitter_t *emitter) { OPT_WRITE_INT64("stats_interval") OPT_WRITE_CHAR_P("stats_interval_opts") OPT_WRITE_CHAR_P("zero_realloc") + OPT_WRITE_SIZE_T("process_madvise_max_batch") emitter_dict_end(emitter); /* Close "opt". */ diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 296b7bff..57aa59e5 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -333,6 +333,7 @@ TEST_BEGIN(test_mallctl_opt) { TEST_MALLCTL_OPT(ssize_t, lg_san_uaf_align, uaf_detection); TEST_MALLCTL_OPT(unsigned, debug_double_free_max_scan, always); TEST_MALLCTL_OPT(bool, limit_usize_gap, limit_usize_gap); + TEST_MALLCTL_OPT(size_t, process_madvise_max_batch, always); #undef TEST_MALLCTL_OPT }