diff --git a/include/jemalloc/internal/arena_externs.h b/include/jemalloc/internal/arena_externs.h index e915c97a..8dd5b015 100644 --- a/include/jemalloc/internal/arena_externs.h +++ b/include/jemalloc/internal/arena_externs.h @@ -30,6 +30,9 @@ extern emap_t arena_emap_global; extern size_t opt_oversize_threshold; extern size_t oversize_threshold; +extern bool opt_huge_arena_pac_thp; +extern pac_thp_t huge_arena_pac_thp; + /* * arena_bin_offsets[binind] is the offset of the first bin shard for size class * binind. diff --git a/include/jemalloc/internal/jemalloc_internal_externs.h b/include/jemalloc/internal/jemalloc_internal_externs.h index 8c6df450..83a37baf 100644 --- a/include/jemalloc/internal/jemalloc_internal_externs.h +++ b/include/jemalloc/internal/jemalloc_internal_externs.h @@ -62,6 +62,8 @@ extern unsigned manual_arena_base; */ extern atomic_p_t arenas[]; +extern unsigned huge_arena_ind; + void *a0malloc(size_t size); void a0dalloc(void *ptr); void *bootstrap_malloc(size_t size); diff --git a/include/jemalloc/internal/pac.h b/include/jemalloc/internal/pac.h index 0b173a58..243e97f3 100644 --- a/include/jemalloc/internal/pac.h +++ b/include/jemalloc/internal/pac.h @@ -125,6 +125,31 @@ struct pac_s { atomic_zu_t extent_sn_next; }; +typedef struct pac_thp_s pac_thp_t; +struct pac_thp_s { + /* + * opt_thp controls THP for user requested allocations. Settings + * "always", "never" and "default" are available if THP is supported + * by the OS and the default extent hooks are used: + * - "always" and "never" are covered by pages_set_thp_state() in + * ehooks_default_alloc_impl(). + * - "default" makes no change for all the other auto arenas except + * the huge arena. For the huge arena, we might also look at + * opt_metadata_thp to decide whether to use THP or not. + * This is a temporary remedy before HPA is fully supported. + */ + bool thp_madvise; + /* Below fields are protected by the lock. 
*/ + malloc_mutex_t lock; + bool auto_thp_switched; + atomic_u_t n_thp_lazy; + /* + * List that tracks HUGEPAGE aligned regions that're lazily hugified + * in auto thp mode. + */ + edata_list_active_t thp_lazy_list; +}; + bool pac_init(tsdn_t *tsdn, pac_t *pac, base_t *base, emap_t *emap, edata_cache_t *edata_cache, nstime_t *cur_time, size_t oversize_threshold, ssize_t dirty_decay_ms, ssize_t muzzy_decay_ms, pac_stats_t *pac_stats, diff --git a/src/arena.c b/src/arena.c index 54ecc403..84d4e14c 100644 --- a/src/arena.c +++ b/src/arena.c @@ -46,7 +46,15 @@ size_t oversize_threshold = OVERSIZE_THRESHOLD_DEFAULT; uint32_t arena_bin_offsets[SC_NBINS]; -static unsigned huge_arena_ind; +/* + * a0 is used to handle huge requests before malloc init completes. After + * that, the huge_arena_ind is updated to point to the actual huge arena, + * which is the last one of the auto arenas. + */ +unsigned huge_arena_ind = 0; +bool opt_huge_arena_pac_thp = false; +pac_thp_t huge_arena_pac_thp = {.thp_madvise = false, + .auto_thp_switched = false, .n_thp_lazy = ATOMIC_INIT(0)}; const arena_config_t arena_config_default = { /* .extent_hooks = */ (extent_hooks_t *)&ehooks_default_extent_hooks, @@ -1898,6 +1906,7 @@ arena_choose_huge(tsd_t *tsd) { bool arena_init_huge(arena_t *a0) { bool huge_enabled; + assert(huge_arena_ind == 0); /* The threshold should be large size class. */ if (opt_oversize_threshold > SC_LARGE_MAXCLASS || @@ -1908,10 +1917,18 @@ arena_init_huge(arena_t *a0) { } else { /* Reserve the index for the huge arena. */ huge_arena_ind = narenas_total_get(); + assert(huge_arena_ind != 0); oversize_threshold = opt_oversize_threshold; /* a0 init happened before malloc_conf_init. */ atomic_store_zu(&a0->pa_shard.pac.oversize_threshold, oversize_threshold, ATOMIC_RELAXED); + /* Initialize huge arena THP settings for PAC. 
*/ + (&huge_arena_pac_thp)->thp_madvise = opt_huge_arena_pac_thp && + metadata_thp_enabled() && (opt_thp == thp_mode_default) && + (init_system_thp_mode == thp_mode_default); + malloc_mutex_init(&(&huge_arena_pac_thp)->lock, "pac_thp", + WITNESS_RANK_LEAF, malloc_mutex_rank_exclusive); + edata_list_active_init(&(&huge_arena_pac_thp)->thp_lazy_list); huge_enabled = true; } diff --git a/src/base.c b/src/base.c index ac8598eb..13367697 100644 --- a/src/base.c +++ b/src/base.c @@ -55,9 +55,6 @@ base_map(tsdn_t *tsdn, ehooks_t *ehooks, unsigned ind, size_t size) { } if (ehooks_are_default(ehooks)) { addr = extent_alloc_mmap(NULL, size, alignment, &zero, &commit); - if (have_madvise_huge && addr) { - pages_set_thp_state(addr, size); - } } else { addr = ehooks_alloc(tsdn, ehooks, NULL, size, alignment, &zero, &commit); @@ -153,6 +150,40 @@ base_get_num_blocks(base_t *base, bool with_new_block) { return n_blocks; } +static void +huge_arena_auto_thp_switch(tsdn_t *tsdn, pac_thp_t *pac_thp) { + assert(opt_huge_arena_pac_thp); + assert(!pac_thp->auto_thp_switched); + + arena_t *huge_arena; + if (huge_arena_ind == 0 || (huge_arena = arena_get(tsdn, huge_arena_ind, + false)) == NULL) { + /* Huge arena hasn't been init yet, simply turn the switch on. 
*/ + pac_thp->auto_thp_switched = true; + return; + } + + assert(huge_arena != NULL); + edata_list_active_t *pending_list; + malloc_mutex_lock(tsdn, &pac_thp->lock); + pending_list = &pac_thp->thp_lazy_list; + pac_thp->auto_thp_switched = true; + malloc_mutex_unlock(tsdn, &pac_thp->lock); + + unsigned cnt = 0; + edata_t *edata; + ql_foreach(edata, &pending_list->head, ql_link_active) { + assert(edata != NULL); + void *addr = edata_addr_get(edata); + size_t size = edata_size_get(edata); + assert(HUGEPAGE_ADDR2BASE(addr) == addr); + assert(HUGEPAGE_CEILING(size) == size && size != 0); + pages_huge(addr, size); + cnt++; + } + assert(cnt == atomic_load_u(&pac_thp->n_thp_lazy, ATOMIC_RELAXED)); +} + static void base_auto_thp_switch(tsdn_t *tsdn, base_t *base) { assert(opt_metadata_thp == metadata_thp_auto); @@ -187,6 +218,15 @@ base_auto_thp_switch(tsdn_t *tsdn, base_t *base) { block = block->next; assert(block == NULL || (base_ind_get(base) == 0)); } + + /* Handle the THP auto switch for the huge arena. */ + if (!huge_arena_pac_thp.thp_madvise || base_ind_get(base) != 0) { + /* Only b0 metadata auto thp switch does the trigger. 
*/ + return; + } + malloc_mutex_unlock(tsdn, &base->mtx); + huge_arena_auto_thp_switch(tsdn, &huge_arena_pac_thp); + malloc_mutex_lock(tsdn, &base->mtx); } static void * diff --git a/src/ctl.c b/src/ctl.c index 2c941ae8..1d7eace6 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -113,6 +113,7 @@ CTL_PROTO(opt_hpa_sec_max_alloc) CTL_PROTO(opt_hpa_sec_max_bytes) CTL_PROTO(opt_hpa_sec_bytes_after_flush) CTL_PROTO(opt_hpa_sec_batch_fill_extra) +CTL_PROTO(opt_huge_arena_pac_thp) CTL_PROTO(opt_metadata_thp) CTL_PROTO(opt_retain) CTL_PROTO(opt_dss) @@ -498,6 +499,7 @@ static const ctl_named_node_t opt_node[] = { CTL(opt_hpa_sec_bytes_after_flush)}, {NAME("hpa_sec_batch_fill_extra"), CTL(opt_hpa_sec_batch_fill_extra)}, + {NAME("huge_arena_pac_thp"), CTL(opt_huge_arena_pac_thp)}, {NAME("metadata_thp"), CTL(opt_metadata_thp)}, {NAME("retain"), CTL(opt_retain)}, {NAME("dss"), CTL(opt_dss)}, @@ -2277,6 +2279,7 @@ CTL_RO_NL_GEN(opt_hpa_sec_bytes_after_flush, opt_hpa_sec_opts.bytes_after_flush, CTL_RO_NL_GEN(opt_hpa_sec_batch_fill_extra, opt_hpa_sec_opts.batch_fill_extra, size_t) +CTL_RO_NL_GEN(opt_huge_arena_pac_thp, opt_huge_arena_pac_thp, bool) CTL_RO_NL_GEN(opt_metadata_thp, metadata_thp_mode_names[opt_metadata_thp], const char *) CTL_RO_NL_GEN(opt_retain, opt_retain, bool) diff --git a/src/extent.c b/src/extent.c index e61b7f9c..86b30f82 100644 --- a/src/extent.c +++ b/src/extent.c @@ -646,6 +646,55 @@ extent_recycle(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, return edata; } +static void +extent_handle_huge_arena_thp(tsdn_t *tsdn, pac_thp_t *pac_thp, + edata_cache_t *edata_cache, void *addr, size_t size) { + assert(opt_huge_arena_pac_thp); + assert(opt_metadata_thp != metadata_thp_disabled); + /* + * With rounding up the given memory region [addr, addr + size) to + * the huge page region that it crosses boundaries with, + * essentially we're aligning the start addr down and the end addr + * up to the nearest HUGEPAGE boundaries. 
The memory overhead can + * be within the range of [0, 2 * (HUGEPAGE - 1)]. + */ + void *huge_addr = HUGEPAGE_ADDR2BASE(addr); + void *huge_end = HUGEPAGE_ADDR2BASE((void *)((byte_t *)addr + + (uintptr_t)(size + HUGEPAGE - 1))); + assert((uintptr_t)huge_end > (uintptr_t)huge_addr); + + size_t huge_size = (uintptr_t)huge_end - (uintptr_t)huge_addr; + assert(huge_size <= (size + ((HUGEPAGE - 1) << 1)) && + huge_size >= size); + + if (opt_metadata_thp == metadata_thp_always || + pac_thp->auto_thp_switched) { + pages_huge(huge_addr, huge_size); + } else { + assert(opt_metadata_thp == metadata_thp_auto); + edata_t *edata = edata_cache_get(tsdn, edata_cache); + + malloc_mutex_lock(tsdn, &pac_thp->lock); + /* Can happen if the switch is turned on during edata retrieval. */ + if (pac_thp->auto_thp_switched) { + malloc_mutex_unlock(tsdn, &pac_thp->lock); + pages_huge(huge_addr, huge_size); + if (edata != NULL) { + edata_cache_put(tsdn, edata_cache, edata); + } + } else { + if (edata != NULL) { + edata_addr_set(edata, huge_addr); + edata_size_set(edata, huge_size); + edata_list_active_append(&pac_thp->thp_lazy_list, edata); + atomic_fetch_add_u(&pac_thp->n_thp_lazy, 1, ATOMIC_RELAXED); + } + malloc_mutex_unlock(tsdn, &pac_thp->lock); + } + malloc_mutex_assert_not_owner(tsdn, &pac_thp->lock); + } +} + /* * If virtual memory is retained, create increasingly larger extents from which * to split requested extents in order to limit the total number of disjoint @@ -688,10 +737,10 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, goto label_err; } - edata_init(edata, ecache_ind_get(&pac->ecache_retained), ptr, - alloc_size, false, SC_NSIZES, extent_sn_next(pac), - extent_state_active, zeroed, committed, EXTENT_PAI_PAC, - EXTENT_IS_HEAD); + unsigned ind = ecache_ind_get(&pac->ecache_retained); + edata_init(edata, ind, ptr, alloc_size, false, SC_NSIZES, + extent_sn_next(pac), extent_state_active, zeroed, committed, + EXTENT_PAI_PAC, EXTENT_IS_HEAD); if 
(extent_register_no_gdump_add(tsdn, pac, edata)) { edata_cache_put(tsdn, pac->edata_cache, edata); @@ -767,6 +816,15 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, exp_grow_size_commit(&pac->exp_grow, exp_grow_skip); malloc_mutex_unlock(tsdn, &pac->grow_mtx); + if (huge_arena_pac_thp.thp_madvise) { + /* Avoid using HUGEPAGE when the grow size is less than HUGEPAGE. */ + if (ind != 0 && ind == huge_arena_ind && ehooks_are_default(ehooks) && + likely(alloc_size >= HUGEPAGE)) { + extent_handle_huge_arena_thp(tsdn, &huge_arena_pac_thp, + pac->edata_cache, ptr, alloc_size); + } + } + if (config_prof) { /* Adjust gdump stats now that extent is final size. */ extent_gdump_add(tsdn, edata); diff --git a/src/jemalloc.c b/src/jemalloc.c index d08771f8..4939d954 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1240,6 +1240,7 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], CONF_HANDLE_BOOL(opt_abort_conf, "abort_conf") CONF_HANDLE_BOOL(opt_cache_oblivious, "cache_oblivious") CONF_HANDLE_BOOL(opt_trust_madvise, "trust_madvise") + CONF_HANDLE_BOOL(opt_huge_arena_pac_thp, "huge_arena_pac_thp") if (strncmp("metadata_thp", k, klen) == 0) { int m; bool match = false; diff --git a/src/stats.c b/src/stats.c index bd0167fb..6e77977f 100644 --- a/src/stats.c +++ b/src/stats.c @@ -1681,6 +1681,7 @@ stats_general_print(emitter_t *emitter) { OPT_WRITE_SIZE_T("hpa_sec_max_bytes") OPT_WRITE_SIZE_T("hpa_sec_bytes_after_flush") OPT_WRITE_SIZE_T("hpa_sec_batch_fill_extra") + OPT_WRITE_BOOL("huge_arena_pac_thp") OPT_WRITE_CHAR_P("metadata_thp") OPT_WRITE_INT64("mutex_max_spin") OPT_WRITE_BOOL_MUTABLE("background_thread", "background_thread")