Implement process_madvise support.

Add opt.process_madvise_max_batch, which determines whether process_madvise is
enabled (non-zero) and the maximum number of regions in each batch.  The batch
size is additionally limited by the space reserved on the stack for the iovec
array, which results in a maximum batch of 128.
This commit is contained in:
Qi Wang 2025-02-04 18:31:11 -08:00
parent 70f019cd3a
commit c3604456d4
13 changed files with 204 additions and 6 deletions

View file

@ -2544,6 +2544,17 @@ if test "x${je_cv_madvise}" = "xyes" ; then
if test "x${je_cv_madv_collapse}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_MADVISE_COLLAPSE], [ ], [ ])
fi
dnl Check for process_madvise.  The probe goes through syscall(2), so it needs
dnl unistd.h for the syscall declaration in addition to sys/syscall.h, which
dnl only supplies the SYS_* numbers.
JE_COMPILABLE([process_madvise(2)], [
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>
], [
	syscall(SYS_process_madvise, 0, (void *)0, 0, 0, 0);
], [je_cv_process_madvise])
if test "x${je_cv_process_madvise}" = "xyes" ; then
  AC_DEFINE([JEMALLOC_HAVE_PROCESS_MADVISE], [ ], [ ])
fi
else
dnl Check for posix_madvise.
JE_COMPILABLE([posix_madvise], [

View file

@ -21,6 +21,16 @@
#define LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT 6
extern size_t opt_lg_extent_max_active_fit;
/*
 * Default value of opt.process_madvise_max_batch.  A value of zero leaves
 * process_madvise disabled; a non-zero value enables it and is the maximum
 * number of regions passed in one batch.
 */
#define PROCESS_MADVISE_MAX_BATCH_DEFAULT 0
extern size_t opt_process_madvise_max_batch;
#ifdef JEMALLOC_HAVE_PROCESS_MADVISE
/*
 * The iovec array is allocated on the stack.  Limit the max batch so that the
 * array's total size stays within VARIABLE_ARRAY_SIZE_MAX and cannot overflow
 * the stack.
 */
#define PROCESS_MADVISE_MAX_BATCH_LIMIT (VARIABLE_ARRAY_SIZE_MAX / sizeof(struct iovec))
#else
/* No process_madvise support: the only permitted batch size is 0 (disabled). */
#define PROCESS_MADVISE_MAX_BATCH_LIMIT 0
#endif
edata_t *ecache_alloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment,
bool zero, bool guarded);
@ -42,6 +52,8 @@ edata_t *extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
bool growing_retained);
void extent_dalloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
edata_t *edata);
/*
 * Deallocation entry point for an extent that the caller has already purged:
 * the extent is marked zeroed and recorded in the retained ecache.
 */
void extent_dalloc_wrapper_purged(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
edata_t *edata);
void extent_destroy_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
edata_t *edata);
bool extent_commit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,

View file

@ -345,6 +345,9 @@
*/
#undef JEMALLOC_MADVISE_NOCORE
/* Defined if process_madvise(2) is available. */
#undef JEMALLOC_HAVE_PROCESS_MADVISE
/* Defined if mprotect(2) is available. */
#undef JEMALLOC_HAVE_MPROTECT

View file

@ -87,6 +87,13 @@ static const bool have_madvise_huge =
false
#endif
;
/*
 * Compile-time flag: true iff JEMALLOC_HAVE_PROCESS_MADVISE is defined, i.e.
 * the process_madvise(2) check succeeded at configure time.
 */
static const bool have_process_madvise =
#ifdef JEMALLOC_HAVE_PROCESS_MADVISE
true
#else
false
#endif
;
static const bool config_fill =
#ifdef JEMALLOC_FILL
true

View file

@ -121,6 +121,7 @@ bool pages_commit(void *addr, size_t size);
bool pages_decommit(void *addr, size_t size);
bool pages_purge_lazy(void *addr, size_t size);
bool pages_purge_forced(void *addr, size_t size);
/*
 * Purge a batch of regions with a single process_madvise call.
 *
 * vec:         array of struct iovec entries, one per region.
 * vec_len:     number of entries in vec.
 * total_bytes: sum of the lengths of all entries in vec.
 *
 * Returns false if exactly total_bytes were purged, true otherwise.
 */
bool pages_purge_process_madvise(void *vec, size_t vec_len, size_t total_bytes);
bool pages_huge(void *addr, size_t size);
bool pages_nohuge(void *addr, size_t size);
bool pages_collapse(void *addr, size_t size);

View file

@ -22,6 +22,10 @@ static inline el_type * \
list_type##_last(const list_type##_t *list) { \
return ql_last(&list->head, linkage); \
} \
static inline el_type * \
list_type##_next(const list_type##_t *list, el_type *item) { \
return ql_next(&list->head, item, linkage); \
} \
static inline void \
list_type##_append(list_type##_t *list, el_type *item) { \
ql_elm_new(item, linkage); \

View file

@ -169,6 +169,7 @@ CTL_PROTO(opt_prof_time_res)
CTL_PROTO(opt_lg_san_uaf_align)
CTL_PROTO(opt_zero_realloc)
CTL_PROTO(opt_limit_usize_gap)
CTL_PROTO(opt_process_madvise_max_batch)
CTL_PROTO(opt_malloc_conf_symlink)
CTL_PROTO(opt_malloc_conf_env_var)
CTL_PROTO(opt_malloc_conf_global_var)
@ -559,6 +560,7 @@ static const ctl_named_node_t opt_node[] = {
{NAME("debug_double_free_max_scan"),
CTL(opt_debug_double_free_max_scan)},
{NAME("limit_usize_gap"), CTL(opt_limit_usize_gap)},
{NAME("process_madvise_max_batch"), CTL(opt_process_madvise_max_batch)},
{NAME("malloc_conf"), CHILD(named, opt_malloc_conf)}
};
@ -2316,6 +2318,8 @@ CTL_RO_NL_GEN(opt_lg_tcache_flush_large_div, opt_lg_tcache_flush_large_div,
CTL_RO_NL_GEN(opt_thp, thp_mode_names[opt_thp], const char *)
CTL_RO_NL_GEN(opt_lg_extent_max_active_fit, opt_lg_extent_max_active_fit,
size_t)
CTL_RO_NL_GEN(opt_process_madvise_max_batch, opt_process_madvise_max_batch,
size_t)
CTL_RO_NL_CGEN(config_prof, opt_prof, opt_prof, bool)
CTL_RO_NL_CGEN(config_prof, opt_prof_prefix, opt_prof_prefix, const char *)
CTL_RO_NL_CGEN(config_prof, opt_prof_active, opt_prof_active, bool)

View file

@ -12,6 +12,13 @@
/* Data. */
size_t opt_lg_extent_max_active_fit = LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT;
/*
 * Maximum number of regions per process_madvise batch; 0 disables
 * process_madvise.  Always 0 when process_madvise is not available.
 */
size_t opt_process_madvise_max_batch =
#ifdef JEMALLOC_HAVE_PROCESS_MADVISE
    PROCESS_MADVISE_MAX_BATCH_DEFAULT
#else
    0
#endif
    ;
static bool extent_commit_impl(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
size_t offset, size_t length, bool growing_retained);
@ -1032,6 +1039,29 @@ extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
return edata;
}
/*
 * Shared tail of the extent_dalloc_wrapper*() functions: calls
 * extent_gdump_sub() when config_prof is set, then records the extent in
 * pac->ecache_retained.
 */
static void
extent_dalloc_wrapper_finish(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
edata_t *edata) {
if (config_prof) {
extent_gdump_sub(tsdn, edata);
}
extent_record(tsdn, pac, ehooks, &pac->ecache_retained, edata);
}
/*
 * Deallocation path for an extent whose pages the caller has already purged.
 * Asserts that the extent belongs to the PAC and that the dalloc hook is
 * expected to fail, marks the extent as zeroed, and finishes through
 * extent_dalloc_wrapper_finish().
 */
void
extent_dalloc_wrapper_purged(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
edata_t *edata) {
assert(edata_pai_get(edata) == EXTENT_PAI_PAC);
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
/* Verify that this extent will not go down the dalloc / munmap route. */
assert(ehooks_dalloc_will_fail(ehooks));
/* Caller-purged pages are treated as zeroed. */
edata_zeroed_set(edata, true);
extent_dalloc_wrapper_finish(tsdn, pac, ehooks, edata);
}
void
extent_dalloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
edata_t *edata) {
@ -1077,11 +1107,7 @@ extent_dalloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
}
edata_zeroed_set(edata, zeroed);
if (config_prof) {
extent_gdump_sub(tsdn, edata);
}
extent_record(tsdn, pac, ehooks, &pac->ecache_retained, edata);
extent_dalloc_wrapper_finish(tsdn, pac, ehooks, edata);
}
void

View file

@ -1361,6 +1361,11 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
"muzzy_decay_ms", -1, NSTIME_SEC_MAX * KQU(1000) <
QU(SSIZE_MAX) ? NSTIME_SEC_MAX * KQU(1000) :
SSIZE_MAX);
CONF_HANDLE_SIZE_T(opt_process_madvise_max_batch,
"process_madvise_max_batch", 0,
PROCESS_MADVISE_MAX_BATCH_LIMIT,
CONF_DONT_CHECK_MIN, CONF_CHECK_MAX,
/* clip */ true)
CONF_HANDLE_BOOL(opt_stats_print, "stats_print")
if (CONF_MATCH("stats_print_opts")) {
init_opt_stats_opts(v, vlen,

View file

@ -435,6 +435,44 @@ pac_stash_decayed(tsdn_t *tsdn, pac_t *pac, ecache_t *ecache,
return nstashed;
}
/*
 * Purge every extent on decay_extents through process_madvise, submitting at
 * most opt_process_madvise_max_batch regions per call.  The list itself is
 * only traversed, never modified.
 *
 * Returns false if every batch was purged in full.  Returns true as soon as
 * any batch fails; earlier batches may already have been purged at that
 * point.
 */
static bool
decay_with_process_madvise(edata_list_inactive_t *decay_extents) {
	cassert(have_process_madvise);
	assert(opt_process_madvise_max_batch > 0);
#ifndef JEMALLOC_HAVE_PROCESS_MADVISE
	return true;
#else
	assert(opt_process_madvise_max_batch <=
	    PROCESS_MADVISE_MAX_BATCH_LIMIT);
	size_t batch_cap = opt_process_madvise_max_batch;
	/* On-stack batch buffer; its size is bounded by the assert above. */
	VARIABLE_ARRAY(struct iovec, vec, batch_cap);

	size_t nfilled = 0;
	size_t batch_bytes = 0;
	edata_t *edata = edata_list_inactive_first(decay_extents);
	while (edata != NULL) {
		size_t extent_bytes = edata_size_get(edata);
		vec[nfilled].iov_base = edata_base_get(edata);
		vec[nfilled].iov_len = extent_bytes;
		batch_bytes += extent_bytes;
		nfilled++;

		/* Buffer is full: submit it and start a fresh batch. */
		if (nfilled == batch_cap) {
			if (pages_purge_process_madvise(vec, batch_cap,
			    batch_bytes)) {
				return true;
			}
			nfilled = 0;
			batch_bytes = 0;
		}
		edata = edata_list_inactive_next(decay_extents, edata);
	}

	/* Submit the final, partially filled batch (if any). */
	if (nfilled == 0) {
		return false;
	}
	return pages_purge_process_madvise(vec, nfilled, batch_bytes);
#endif
}
static size_t
pac_decay_stashed(tsdn_t *tsdn, pac_t *pac, decay_t *decay,
pac_decay_stats_t *decay_stats, ecache_t *ecache, bool fully_decay,
@ -450,6 +488,28 @@ pac_decay_stashed(tsdn_t *tsdn, pac_t *pac, decay_t *decay,
bool try_muzzy = !fully_decay
&& pac_decay_ms_get(pac, extent_state_muzzy) != 0;
bool purge_to_retained = !try_muzzy ||
ecache->state == extent_state_muzzy;
/*
* Attempt process_madvise only if 1) enabled, 2) purging to retained,
* and 3) not using custom hooks.
*/
bool try_process_madvise = (opt_process_madvise_max_batch > 0) &&
purge_to_retained && ehooks_dalloc_will_fail(ehooks);
bool already_purged;
if (try_process_madvise) {
/*
* If anything unexpected happens during process_madvise (e.g.
* MADV_DONTNEED is not supported, or the call only partially
* succeeds), treat nothing as purged and fall back to the
* regular madvise.
*/
already_purged = !decay_with_process_madvise(decay_extents);
} else {
already_purged = false;
}
for (edata_t *edata = edata_list_inactive_first(decay_extents); edata !=
NULL; edata = edata_list_inactive_first(decay_extents)) {
edata_list_inactive_remove(decay_extents, edata);
@ -473,7 +533,12 @@ pac_decay_stashed(tsdn_t *tsdn, pac_t *pac, decay_t *decay,
}
JEMALLOC_FALLTHROUGH;
case extent_state_muzzy:
extent_dalloc_wrapper(tsdn, pac, ehooks, edata);
if (already_purged) {
extent_dalloc_wrapper_purged(tsdn, pac, ehooks,
edata);
} else {
extent_dalloc_wrapper(tsdn, pac, ehooks, edata);
}
nunmapped += npages;
break;
case extent_state_active:

View file

@ -617,6 +617,58 @@ pages_dodump(void *addr, size_t size) {
#endif
}
#ifdef JEMALLOC_HAVE_PROCESS_MADVISE
#include <sys/mman.h>
#include <sys/syscall.h>
/* File descriptor from pidfd_open for this process; set by init_process_madvise(). */
static int pidfd;

/*
 * One-time setup for process_madvise.  Does nothing when the feature is
 * disabled (opt_process_madvise_max_batch == 0).  Otherwise clamps the option
 * to PROCESS_MADVISE_MAX_BATCH_LIMIT and opens a pidfd for the current
 * process.
 *
 * Returns true on failure (pidfd_open returned -1), false otherwise.
 *
 * NOTE(review): the pidfd is opened once, for the pid at init time; confirm
 * how this is expected to behave in a forked child.
 */
static bool
init_process_madvise(void) {
	if (opt_process_madvise_max_batch == 0) {
		return false;
	}

	if (opt_process_madvise_max_batch > PROCESS_MADVISE_MAX_BATCH_LIMIT) {
		opt_process_madvise_max_batch = PROCESS_MADVISE_MAX_BATCH_LIMIT;
	}

	pidfd = syscall(SYS_pidfd_open, getpid(), 0);
	return pidfd == -1;
}
/*
 * Issue one process_madvise(MADV_DONTNEED) call on pidfd for the vec_len
 * struct iovec entries at vec.
 *
 * Returns false only if the syscall's return value equals total_bytes.  A
 * return of -1 becomes a huge size_t after the cast, so it is reported as
 * failure (true), as is a partial purge.
 */
static bool
pages_purge_process_madvise_impl(void *vec, size_t vec_len,
    size_t total_bytes) {
	struct iovec *regions = (struct iovec *)vec;
	size_t nbytes_done = (size_t)syscall(SYS_process_madvise, pidfd,
	    regions, vec_len, MADV_DONTNEED, 0);

	if (nbytes_done != total_bytes) {
		return true;
	}
	return false;
}
#else
/*
 * Variant used when JEMALLOC_HAVE_PROCESS_MADVISE is not defined: there is
 * nothing to set up, so always report success (false).
 */
static bool
init_process_madvise(void) {
return false;
}
/*
 * Variant used when JEMALLOC_HAVE_PROCESS_MADVISE is not defined.  It is not
 * expected to be called (not_reached()); the trailing return reports failure.
 */
static bool
pages_purge_process_madvise_impl(void *vec, size_t vec_len,
size_t total_bytes) {
not_reached();
return true;
}
#endif
/*
 * Public entry point for batched purging: forwards to
 * pages_purge_process_madvise_impl() and returns its result unchanged
 * (true on failure, false when total_bytes were purged).
 */
bool
pages_purge_process_madvise(void *vec, size_t vec_len, size_t total_bytes) {
return pages_purge_process_madvise_impl(vec, vec_len, total_bytes);
}
static size_t
os_page_detect(void) {
@ -833,6 +885,12 @@ pages_boot(void) {
os_pages_unmap(madv_free_page, PAGE);
}
#endif
if (init_process_madvise()) {
if (opt_abort) {
abort();
}
return true;
}
return false;
}

View file

@ -1727,6 +1727,7 @@ stats_general_print(emitter_t *emitter) {
OPT_WRITE_INT64("stats_interval")
OPT_WRITE_CHAR_P("stats_interval_opts")
OPT_WRITE_CHAR_P("zero_realloc")
OPT_WRITE_SIZE_T("process_madvise_max_batch")
emitter_dict_end(emitter); /* Close "opt". */

View file

@ -333,6 +333,7 @@ TEST_BEGIN(test_mallctl_opt) {
TEST_MALLCTL_OPT(ssize_t, lg_san_uaf_align, uaf_detection);
TEST_MALLCTL_OPT(unsigned, debug_double_free_max_scan, always);
TEST_MALLCTL_OPT(bool, limit_usize_gap, limit_usize_gap);
TEST_MALLCTL_OPT(size_t, process_madvise_max_batch, always);
#undef TEST_MALLCTL_OPT
}