Support THP with Huge Arena in PAC

This commit is contained in:
Shirui Cheng 2025-03-05 13:00:57 -08:00
parent 86bbabac32
commit 86edaaa929
9 changed files with 158 additions and 8 deletions

View file

@ -30,6 +30,9 @@ extern emap_t arena_emap_global;
extern size_t opt_oversize_threshold;
extern size_t oversize_threshold;
extern bool opt_huge_arena_pac_thp;
extern pac_thp_t huge_arena_pac_thp;
/*
* arena_bin_offsets[binind] is the offset of the first bin shard for size class
* binind.

View file

@ -62,6 +62,8 @@ extern unsigned manual_arena_base;
*/
extern atomic_p_t arenas[];
extern unsigned huge_arena_ind;
void *a0malloc(size_t size);
void a0dalloc(void *ptr);
void *bootstrap_malloc(size_t size);

View file

@ -125,6 +125,31 @@ struct pac_s {
atomic_zu_t extent_sn_next;
};
typedef struct pac_thp_s pac_thp_t;
/*
 * Per-huge-arena THP (transparent huge page) state for the page allocator
 * container (PAC).  A single global instance (huge_arena_pac_thp) is shared
 * between arena init, base metadata handling, and extent growth.
 */
struct pac_thp_s {
	/*
	 * opt_thp controls THP for user requested allocations. Settings
	 * "always", "never" and "default" are available if THP is supported
	 * by the OS and the default extent hooks are used:
	 * - "always" and "never" are covered by pages_set_thp_state() in
	 *   ehooks_default_alloc_impl().
	 * - "default" makes no change for all the other auto arenas except
	 *   the huge arena. For the huge arena, we might also look at
	 *   opt_metadata_thp to decide whether to use THP or not.
	 * This is a temporary remedy before HPA is fully supported.
	 */
	bool thp_madvise;
	/* Below fields are protected by the lock. */
	malloc_mutex_t lock;
	/* True once the auto-mode switch has fired; never reset. */
	bool auto_thp_switched;
	/* Count of regions queued on thp_lazy_list (relaxed counter). */
	atomic_u_t n_thp_lazy;
	/*
	 * List that tracks HUGEPAGE aligned regions that are lazily hugified
	 * in auto thp mode (drained once auto_thp_switched flips to true).
	 */
	edata_list_active_t thp_lazy_list;
};
bool pac_init(tsdn_t *tsdn, pac_t *pac, base_t *base, emap_t *emap,
edata_cache_t *edata_cache, nstime_t *cur_time, size_t oversize_threshold,
ssize_t dirty_decay_ms, ssize_t muzzy_decay_ms, pac_stats_t *pac_stats,

View file

@ -46,7 +46,15 @@ size_t oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT;
uint32_t arena_bin_offsets[SC_NBINS];
static unsigned huge_arena_ind;
/*
* a0 is used to handle huge requests before malloc init completes. After
* that, the huge_arena_ind is updated to point to the actual huge arena,
* which is the last one of the auto arenas.
*/
unsigned huge_arena_ind = 0;
bool opt_huge_arena_pac_thp = false;
pac_thp_t huge_arena_pac_thp = {.thp_madvise = false,
.auto_thp_switched = false, .n_thp_lazy = ATOMIC_INIT(0)};
const arena_config_t arena_config_default = {
/* .extent_hooks = */ (extent_hooks_t *)&ehooks_default_extent_hooks,
@ -1898,6 +1906,7 @@ arena_choose_huge(tsd_t *tsd) {
bool
arena_init_huge(arena_t *a0) {
bool huge_enabled;
assert(huge_arena_ind == 0);
/* The threshold should be large size class. */
if (opt_oversize_threshold > SC_LARGE_MAXCLASS ||
@ -1908,10 +1917,18 @@ arena_init_huge(arena_t *a0) {
} else {
/* Reserve the index for the huge arena. */
huge_arena_ind = narenas_total_get();
assert(huge_arena_ind != 0);
oversize_threshold = opt_oversize_threshold;
/* a0 init happened before malloc_conf_init. */
atomic_store_zu(&a0->pa_shard.pac.oversize_threshold,
oversize_threshold, ATOMIC_RELAXED);
/* Initialize huge arena THP settings for PAC. */
(&huge_arena_pac_thp)->thp_madvise = opt_huge_arena_pac_thp &&
metadata_thp_enabled() && (opt_thp == thp_mode_default) &&
(init_system_thp_mode == thp_mode_default);
malloc_mutex_init(&(&huge_arena_pac_thp)->lock, "pac_thp",
WITNESS_RANK_LEAF, malloc_mutex_rank_exclusive);
edata_list_active_init(&(&huge_arena_pac_thp)->thp_lazy_list);
huge_enabled = true;
}

View file

@ -55,9 +55,6 @@ base_map(tsdn_t *tsdn, ehooks_t *ehooks, unsigned ind, size_t size) {
}
if (ehooks_are_default(ehooks)) {
addr = extent_alloc_mmap(NULL, size, alignment, &zero, &commit);
if (have_madvise_huge && addr) {
pages_set_thp_state(addr, size);
}
} else {
addr = ehooks_alloc(tsdn, ehooks, NULL, size, alignment, &zero,
&commit);
@ -153,6 +150,40 @@ base_get_num_blocks(base_t *base, bool with_new_block) {
return n_blocks;
}
/*
 * Turn on the huge arena's auto-THP switch and madvise every region that was
 * queued on thp_lazy_list while the switch was still off.  Triggered from
 * base_auto_thp_switch() when b0 metadata crosses the auto-THP threshold;
 * the caller must guarantee it runs at most once (see the entry assert).
 */
static void
huge_arena_auto_thp_switch(tsdn_t *tsdn, pac_thp_t *pac_thp) {
	assert(opt_huge_arena_pac_thp);
	assert(!pac_thp->auto_thp_switched);
	/*
	 * huge_arena is fetched only to detect whether the huge arena has been
	 * initialized; its fields are not otherwise used here.
	 */
	arena_t *huge_arena;
	if (huge_arena_ind == 0 || (huge_arena = arena_get(tsdn, huge_arena_ind,
	    false)) == NULL) {
		/* Huge arena hasn't been init yet, simply turn the switch on. */
		pac_thp->auto_thp_switched = true;
		return;
	}
	assert(huge_arena != NULL);
	edata_list_active_t *pending_list;
	/*
	 * Flip the switch under the lock: appenders in
	 * extent_handle_huge_arena_thp() re-check auto_thp_switched while
	 * holding this lock, so after the unlock below no new entries can be
	 * added to thp_lazy_list.
	 */
	malloc_mutex_lock(tsdn, &pac_thp->lock);
	pending_list = &pac_thp->thp_lazy_list;
	pac_thp->auto_thp_switched = true;
	malloc_mutex_unlock(tsdn, &pac_thp->lock);
	/*
	 * Walking the list without the lock is safe because the list is now
	 * frozen (see above); this keeps the pages_huge() syscalls outside the
	 * critical section.
	 */
	unsigned cnt = 0;
	edata_t *edata;
	ql_foreach(edata, &pending_list->head, ql_link_active) {
		assert(edata != NULL);
		void *addr = edata_addr_get(edata);
		size_t size = edata_size_get(edata);
		/* Queued regions were rounded to HUGEPAGE bounds on insert. */
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size && size != 0);
		pages_huge(addr, size);
		cnt++;
	}
	/* Every region counted at enqueue time must have been processed. */
	assert(cnt == atomic_load_u(&pac_thp->n_thp_lazy, ATOMIC_RELAXED));
}
static void
base_auto_thp_switch(tsdn_t *tsdn, base_t *base) {
assert(opt_metadata_thp == metadata_thp_auto);
@ -187,6 +218,15 @@ base_auto_thp_switch(tsdn_t *tsdn, base_t *base) {
block = block->next;
assert(block == NULL || (base_ind_get(base) == 0));
}
/* Handle the THP auto switch for the huge arena. */
if (!huge_arena_pac_thp.thp_madvise || base_ind_get(base) != 0) {
/* Only the b0 metadata auto-THP switch performs the trigger. */
return;
}
malloc_mutex_unlock(tsdn, &base->mtx);
huge_arena_auto_thp_switch(tsdn, &huge_arena_pac_thp);
malloc_mutex_lock(tsdn, &base->mtx);
}
static void *

View file

@ -113,6 +113,7 @@ CTL_PROTO(opt_hpa_sec_max_alloc)
CTL_PROTO(opt_hpa_sec_max_bytes)
CTL_PROTO(opt_hpa_sec_bytes_after_flush)
CTL_PROTO(opt_hpa_sec_batch_fill_extra)
CTL_PROTO(opt_huge_arena_pac_thp)
CTL_PROTO(opt_metadata_thp)
CTL_PROTO(opt_retain)
CTL_PROTO(opt_dss)
@ -498,6 +499,7 @@ static const ctl_named_node_t opt_node[] = {
CTL(opt_hpa_sec_bytes_after_flush)},
{NAME("hpa_sec_batch_fill_extra"),
CTL(opt_hpa_sec_batch_fill_extra)},
{NAME("huge_arena_pac_thp"), CTL(opt_huge_arena_pac_thp)},
{NAME("metadata_thp"), CTL(opt_metadata_thp)},
{NAME("retain"), CTL(opt_retain)},
{NAME("dss"), CTL(opt_dss)},
@ -2277,6 +2279,7 @@ CTL_RO_NL_GEN(opt_hpa_sec_bytes_after_flush, opt_hpa_sec_opts.bytes_after_flush,
CTL_RO_NL_GEN(opt_hpa_sec_batch_fill_extra, opt_hpa_sec_opts.batch_fill_extra,
size_t)
CTL_RO_NL_GEN(opt_huge_arena_pac_thp, opt_huge_arena_pac_thp, bool)
CTL_RO_NL_GEN(opt_metadata_thp, metadata_thp_mode_names[opt_metadata_thp],
const char *)
CTL_RO_NL_GEN(opt_retain, opt_retain, bool)

View file

@ -646,6 +646,55 @@ extent_recycle(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache,
return edata;
}
/*
 * Apply THP policy to a freshly grown huge-arena extent [addr, addr + size).
 * Depending on opt_metadata_thp and whether the auto switch has already
 * fired, either madvise the region now or queue it on pac_thp->thp_lazy_list
 * to be hugified later by huge_arena_auto_thp_switch().
 */
static void
extent_handle_huge_arena_thp(tsdn_t *tsdn, pac_thp_t *pac_thp,
    edata_cache_t *edata_cache, void *addr, size_t size) {
	assert(opt_huge_arena_pac_thp);
	assert(opt_metadata_thp != metadata_thp_disabled);

	/*
	 * With rounding up the given memory region [addr, addr + size) to
	 * the huge page region that it crosses boundaries with,
	 * essentially we're aligning the start addr down and the end addr
	 * up to the nearest HUGEPAGE boundaries. The memory overhead can
	 * be within the range of [0, 2 * (HUGEPAGE - 1)].
	 */
	void *huge_addr = HUGEPAGE_ADDR2BASE(addr);
	void *huge_end = HUGEPAGE_ADDR2BASE((void *)((byte_t *)addr +
	    (uintptr_t)(size + HUGEPAGE - 1)));
	assert((uintptr_t)huge_end > (uintptr_t)huge_addr);
	size_t huge_size = (uintptr_t)huge_end - (uintptr_t)huge_addr;
	assert(huge_size <= (size + ((HUGEPAGE - 1) << 1)) &&
	    huge_size >= size);

	if (opt_metadata_thp == metadata_thp_always ||
	    pac_thp->auto_thp_switched) {
		/* Eager path: request THP backing immediately. */
		pages_huge(huge_addr, huge_size);
	} else {
		assert(opt_metadata_thp == metadata_thp_auto);
		/*
		 * Lazy path: the edata here is used purely as a list node to
		 * remember the region; it does not own the memory.  Grab it
		 * before taking the lock to keep cache work outside the
		 * critical section.
		 */
		edata_t *edata = edata_cache_get(tsdn, edata_cache);
		malloc_mutex_lock(tsdn, &pac_thp->lock);
		/* Can happen if the switch is turned on during edata retrieval. */
		if (pac_thp->auto_thp_switched) {
			malloc_mutex_unlock(tsdn, &pac_thp->lock);
			/* The drain already ran; hugify directly instead. */
			pages_huge(huge_addr, huge_size);
			if (edata != NULL) {
				edata_cache_put(tsdn, edata_cache, edata);
			}
		} else {
			/*
			 * NOTE(review): if edata_cache_get() returned NULL the
			 * region is silently skipped (best-effort THP) —
			 * appears intentional; confirm.
			 */
			if (edata != NULL) {
				edata_addr_set(edata, huge_addr);
				edata_size_set(edata, huge_size);
				edata_list_active_append(&pac_thp->thp_lazy_list, edata);
				atomic_fetch_add_u(&pac_thp->n_thp_lazy, 1, ATOMIC_RELAXED);
			}
			malloc_mutex_unlock(tsdn, &pac_thp->lock);
		}
		malloc_mutex_assert_not_owner(tsdn, &pac_thp->lock);
	}
}
/*
* If virtual memory is retained, create increasingly larger extents from which
* to split requested extents in order to limit the total number of disjoint
@ -688,10 +737,10 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
goto label_err;
}
edata_init(edata, ecache_ind_get(&pac->ecache_retained), ptr,
alloc_size, false, SC_NSIZES, extent_sn_next(pac),
extent_state_active, zeroed, committed, EXTENT_PAI_PAC,
EXTENT_IS_HEAD);
unsigned ind = ecache_ind_get(&pac->ecache_retained);
edata_init(edata, ind, ptr, alloc_size, false, SC_NSIZES,
extent_sn_next(pac), extent_state_active, zeroed, committed,
EXTENT_PAI_PAC, EXTENT_IS_HEAD);
if (extent_register_no_gdump_add(tsdn, pac, edata)) {
edata_cache_put(tsdn, pac->edata_cache, edata);
@ -767,6 +816,15 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
exp_grow_size_commit(&pac->exp_grow, exp_grow_skip);
malloc_mutex_unlock(tsdn, &pac->grow_mtx);
if (huge_arena_pac_thp.thp_madvise) {
/* Avoid using HUGEPAGE when the grow size is less than HUGEPAGE. */
if (ind != 0 && ind == huge_arena_ind && ehooks_are_default(ehooks) &&
likely(alloc_size >= HUGEPAGE)) {
extent_handle_huge_arena_thp(tsdn, &huge_arena_pac_thp,
pac->edata_cache, ptr, alloc_size);
}
}
if (config_prof) {
/* Adjust gdump stats now that extent is final size. */
extent_gdump_add(tsdn, edata);

View file

@ -1240,6 +1240,7 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
CONF_HANDLE_BOOL(opt_abort_conf, "abort_conf")
CONF_HANDLE_BOOL(opt_cache_oblivious, "cache_oblivious")
CONF_HANDLE_BOOL(opt_trust_madvise, "trust_madvise")
CONF_HANDLE_BOOL(opt_huge_arena_pac_thp, "huge_arena_pac_thp")
if (strncmp("metadata_thp", k, klen) == 0) {
int m;
bool match = false;

View file

@ -1681,6 +1681,7 @@ stats_general_print(emitter_t *emitter) {
OPT_WRITE_SIZE_T("hpa_sec_max_bytes")
OPT_WRITE_SIZE_T("hpa_sec_bytes_after_flush")
OPT_WRITE_SIZE_T("hpa_sec_batch_fill_extra")
OPT_WRITE_BOOL("huge_arena_pac_thp")
OPT_WRITE_CHAR_P("metadata_thp")
OPT_WRITE_INT64("mutex_max_spin")
OPT_WRITE_BOOL_MUTABLE("background_thread", "background_thread")