#include "jemalloc/internal/jemalloc_preamble.h" #include "jemalloc/internal/jemalloc_internal_includes.h" #include "jemalloc/internal/hpa.h" #include "jemalloc/internal/hpa_utils.h" #include "jemalloc/internal/fb.h" #include "jemalloc/internal/witness.h" #include "jemalloc/internal/jemalloc_probe.h" static void hpa_dalloc_batch(tsdn_t *tsdn, hpa_shard_t *shard, edata_list_active_t *list, bool *deferred_work_generated); const char *const hpa_hugify_style_names[] = {"auto", "none", "eager", "lazy"}; bool opt_experimental_hpa_start_huge_if_thp_always = true; bool opt_experimental_hpa_enforce_hugify = false; bool hpa_hugepage_size_exceeds_limit(void) { return HUGEPAGE > HUGEPAGE_MAX_EXPECTED_SIZE; } bool hpa_supported(void) { #ifdef _WIN32 /* * At least until the API and implementation is somewhat settled, we * don't want to try to debug the VM subsystem on the hardest-to-test * platform. */ return false; #endif if (!pages_can_hugify) { return false; } /* * We fundamentally rely on an address-space-hungry growth strategy for * hugepages. */ if (LG_SIZEOF_PTR != 3) { return false; } /* * If we couldn't detect the value of HUGEPAGE, HUGEPAGE_PAGES becomes * this sentinel value -- see the comment in pages.h. */ if (HUGEPAGE_PAGES == 1) { return false; } /* As mentioned in pages.h, do not support If HUGEPAGE is too large. */ if (hpa_hugepage_size_exceeds_limit()) { return false; } return true; } static void hpa_do_consistency_checks(hpa_shard_t *shard) { assert(shard->base != NULL); } bool hpa_shard_init(tsdn_t *tsdn, hpa_shard_t *shard, hpa_central_t *central, emap_t *emap, base_t *base, edata_cache_t *edata_cache, unsigned ind, const hpa_shard_opts_t *opts, const sec_opts_t *sec_opts) { /* malloc_conf processing should have filtered out these cases. */ assert(hpa_supported()); bool err; err = malloc_mutex_init(&shard->grow_mtx, "hpa_shard_grow", WITNESS_RANK_HPA_SHARD_GROW, malloc_mutex_rank_exclusive); if (err) { return true; } err = malloc_mutex_init(&shard->mtx, "hpa_shard", WITNESS_RANK_HPA_SHARD, malloc_mutex_rank_exclusive); if (err) { return true; } assert(edata_cache != NULL); shard->central = central; shard->base = base; edata_cache_fast_init(&shard->ecf, edata_cache); psset_init(&shard->psset); shard->age_counter = 0; shard->ind = ind; shard->emap = emap; shard->opts = *opts; shard->npending_purge = 0; nstime_init_zero(&shard->last_purge); nstime_init_zero(&shard->last_time_work_attempted); shard->stats.npurge_passes = 0; shard->stats.npurges = 0; shard->stats.nhugifies = 0; shard->stats.nhugify_failures = 0; shard->stats.ndehugifies = 0; err = sec_init(tsdn, &shard->sec, base, sec_opts); if (err) { return true; } hpa_do_consistency_checks(shard); return false; } /* * Note that the stats functions here follow the usual stats naming conventions; * "merge" obtains the stats from some live object of instance, while "accum" * only combines the stats from one stats object to another. Hence the lack of * locking here. 
static void
hpa_shard_nonderived_stats_accum(hpa_shard_nonderived_stats_t *dst,
    hpa_shard_nonderived_stats_t *src) {
	dst->npurge_passes += src->npurge_passes;
	dst->npurges += src->npurges;
	dst->nhugifies += src->nhugifies;
	dst->nhugify_failures += src->nhugify_failures;
	dst->ndehugifies += src->ndehugifies;
}

void
hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) {
	psset_stats_accum(&dst->psset_stats, &src->psset_stats);
	hpa_shard_nonderived_stats_accum(&dst->nonderived_stats,
	    &src->nonderived_stats);
	sec_stats_accum(&dst->secstats, &src->secstats);
}

void
hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard,
    hpa_shard_stats_t *dst) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->grow_mtx);
	malloc_mutex_lock(tsdn, &shard->mtx);
	psset_stats_accum(&dst->psset_stats, &shard->psset.stats);
	hpa_shard_nonderived_stats_accum(&dst->nonderived_stats, &shard->stats);
	malloc_mutex_unlock(tsdn, &shard->mtx);
	malloc_mutex_unlock(tsdn, &shard->grow_mtx);

	sec_stats_merge(tsdn, &shard->sec, &dst->secstats);
}

static bool
hpa_is_hugify_eager(hpa_shard_t *shard) {
	return shard->opts.hugify_style == hpa_hugify_style_eager;
}

static bool
hpa_is_hugify_lazy(hpa_shard_t *shard) {
	/* When hugify_sync == true we also set/unset the HG bit manually. */
	return shard->opts.hugify_style == hpa_hugify_style_lazy ||
	    shard->opts.hugify_sync;
}

static bool
hpa_is_hugify_none(hpa_shard_t *shard) {
	return shard->opts.hugify_style == hpa_hugify_style_none;
}

/*
 * Experimentation has shown that when we purge only HUGEPAGE ranges and
 * hugify eagerly (or run with THP enabled=always), we get huge pages more
 * often.  This helps us keep our accounting more realistic.
 */
static bool
hpa_should_assume_huge(hpa_shard_t *shard, const hpdata_t *ps) {
	return (hpa_is_hugify_eager(shard) || hpa_is_hugify_none(shard)) &&
	    hpdata_purged_when_empty_and_huge_get(ps);
}

static bool
hpa_good_hugification_candidate(hpa_shard_t *shard, hpdata_t *ps) {
	/*
	 * Note that this needs to be >= rather than just >, because of the
	 * important special case in which the hugification threshold is
	 * exactly HUGEPAGE.
	 */
	return hpdata_nactive_get(ps) * PAGE >=
	    shard->opts.hugification_threshold;
}

static bool
hpa_good_purge_candidate(hpa_shard_t *shard, hpdata_t *ps) {
	if (shard->opts.dirty_mult == (fxp_t)-1) {
		/* No purging. */
		return false;
	}
	size_t ndirty = hpdata_ndirty_get(ps);
	/* Empty pages are good candidates for purging. */
	if (ndirty > 0 && hpdata_empty(ps)) {
		return true;
	}
	return ndirty * PAGE >= shard->opts.purge_threshold;
}

static size_t
hpa_adjusted_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	return psset_ndirty(&shard->psset) - shard->npending_purge;
}

static size_t
hpa_ndirty_max(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (shard->opts.dirty_mult == (fxp_t)-1) {
		return (size_t)-1;
	}
	return fxp_mul_frac(psset_nactive(&shard->psset),
	    shard->opts.dirty_mult);
}

static bool
hpa_hugify_blocked_by_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify == NULL) {
		return false;
	}
	return hpa_adjusted_ndirty(tsdn, shard)
	    + hpdata_nretained_get(to_hugify) > hpa_ndirty_max(tsdn, shard);
}

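/*
 * An illustrative note on the dirty-page budget computed above (hypothetical
 * numbers): hpa_ndirty_max() multiplies the psset's active page count by the
 * fixed-point fraction dirty_mult, so with dirty_mult configured as 0.25 and
 * 1000 active pages the budget is roughly 250 dirty pages; purging becomes
 * necessary once the adjusted dirty count (dirty pages minus those already
 * queued for purging) exceeds that.  A dirty_mult of (fxp_t)-1 disables the
 * bound entirely.
 */
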
static bool
hpa_should_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	/*
	 * The page that is purgeable may be delayed, but we just want to know
	 * whether there is a need for the background thread to wake up in the
	 * future.
	 */
	hpdata_t *ps = psset_pick_purge(&shard->psset, NULL);
	if (ps == NULL) {
		return false;
	}
	if (hpa_adjusted_ndirty(tsdn, shard) > hpa_ndirty_max(tsdn, shard)) {
		return true;
	}
	if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
		return true;
	}
	return false;
}

static void
hpa_assume_huge(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	assert(hpa_should_assume_huge(shard, ps));
	if (hpdata_huge_get(ps) || hpdata_empty(ps)) {
		return;
	}
	if (hpdata_ntouched_get(ps) != HUGEPAGE_PAGES) {
		hpdata_hugify(ps);
	}
}

static void
hpa_update_purge_hugify_eligibility(tsdn_t *tsdn, hpa_shard_t *shard,
    hpdata_t *ps) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (hpdata_changing_state_get(ps)) {
		hpdata_purge_allowed_set(ps, false);
		hpdata_disallow_hugify(ps);
		return;
	}
	/*
	 * Hugepages are distinctly costly to purge, so try to avoid it unless
	 * they're *particularly* full of dirty pages.  Eventually, we should
	 * use a smarter / more dynamic heuristic for situations where we have
	 * to manually hugify.
	 *
	 * In situations where we don't manually hugify, this problem is
	 * reduced.  The "bad" situation we're trying to avoid is one that's
	 * common in some Linux configurations (where both enabled and defrag
	 * are set to madvise) and that can lead to long latency spikes on the
	 * first access after a hugification.  The ideal policy in such
	 * configurations is probably time-based for both purging and
	 * hugifying; only hugify a hugepage if it's met the criteria for some
	 * extended period of time, and only dehugify it if it's failed to meet
	 * the criteria for an extended period of time.  When background
	 * threads are on, we should try to take this hit on one of them, as
	 * well.
	 *
	 * I think the ideal setting is THP always enabled, and defrag set to
	 * deferred; in that case we don't need any explicit calls on the
	 * allocator's end at all; we just try to pack allocations in a
	 * hugepage-friendly manner and let the OS hugify in the background.
	 */
	if (hpa_should_assume_huge(shard, ps)) {
		/* Assume it is huge, without the need to madvise. */
		hpa_assume_huge(tsdn, shard, ps);
	}
	if ((hpa_is_hugify_lazy(shard) || opt_experimental_hpa_enforce_hugify)
	    && hpa_good_hugification_candidate(shard, ps)
	    && !hpdata_huge_get(ps)) {
		nstime_t now;
		shard->central->hooks.curtime(&now, /* first_reading */ true);
		hpdata_allow_hugify(ps, now);
	}

	bool purgable = hpa_good_purge_candidate(shard, ps);
	if (purgable && !hpdata_purge_allowed_get(ps)
	    && (shard->opts.min_purge_delay_ms > 0)) {
		nstime_t now;
		uint64_t delayns =
		    shard->opts.min_purge_delay_ms * 1000 * 1000;
		shard->central->hooks.curtime(&now, /* first_reading */ true);
		nstime_iadd(&now, delayns);
		hpdata_time_purge_allowed_set(ps, &now);
	}
	hpdata_purge_allowed_set(ps, purgable);

	/*
	 * Once a hugepage has become eligible for hugification, we don't mark
	 * it as ineligible just because it stops meeting the criteria (this
	 * could lead to situations where a hugepage that spends most of its
	 * time meeting the criteria never quite gets hugified if there are
	 * intervening deallocations).  The idea is that the hugification delay
	 * will allow them to get purged, resetting their "hugify-allowed" bit.
	 * If they don't get purged, then the hugification isn't hurting and
	 * might help.  As an exception, we don't hugify hugepages that are now
	 * empty; it definitely doesn't help there until the hugepage gets
	 * reused, which is likely not for a while.
	 */
	if (hpdata_nactive_get(ps) == 0 && !hpa_should_assume_huge(shard, ps)) {
		hpdata_disallow_hugify(ps);
	}
}

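/*
 * A rough sketch of how the eligibility logic above interacts with
 * hugify_style (descriptive only): with the "eager" or "none" styles (and
 * neither hugify_sync nor opt_experimental_hpa_enforce_hugify set), no
 * explicit hugify call is ever scheduled from here; slabs that were purged
 * while empty and huge are instead assumed huge again via hpa_assume_huge().
 * With "lazy" (or with hugify_sync or the enforce option set), we record a
 * hugify-allowed timestamp and let hpa_try_hugify() make the actual call once
 * hugify_delay_ms has elapsed.
 */
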
static bool
hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	return to_hugify != NULL || hpa_should_purge(tsdn, shard);
}

static inline bool
hpa_needs_dehugify(hpa_shard_t *shard, const hpdata_t *ps) {
	return (hpa_is_hugify_lazy(shard) || opt_experimental_hpa_enforce_hugify)
	    && hpdata_huge_get(ps) && !hpdata_empty(ps);
}

/*
 * Prepare the purge of one hugepage and return the number of dirty regular
 * pages on it, or 0 if no purgeable hugepage is found.
 *
 * If a hugepage was picked, its purge state is initialized.
 */
static inline size_t
hpa_purge_start_hp(hpa_purge_batch_t *b, hpa_shard_t *shard) {
	psset_t *psset = &shard->psset;
	hpdata_t *to_purge = (shard->opts.min_purge_delay_ms > 0)
	    ? psset_pick_purge(psset, &shard->last_time_work_attempted)
	    : psset_pick_purge(psset, NULL);
	if (to_purge == NULL) {
		return 0;
	}
	assert(hpdata_purge_allowed_get(to_purge));
	assert(!hpdata_changing_state_get(to_purge));

	/*
	 * Don't let anyone else purge or hugify this page while
	 * we're purging it (allocations and deallocations are
	 * OK).
	 */
	psset_update_begin(psset, to_purge);
	assert(hpdata_alloc_allowed_get(to_purge));
	hpdata_mid_purge_set(to_purge, true);
	hpdata_purge_allowed_set(to_purge, false);
	hpdata_disallow_hugify(to_purge);
	/*
	 * Unlike with hugification (where concurrent
	 * allocations are allowed), concurrent allocation out
	 * of a hugepage being purged is unsafe; we might hand
	 * out an extent for an allocation and then purge it
	 * (clearing out user data).
	 */
	hpdata_alloc_allowed_set(to_purge, false);
	psset_update_end(psset, to_purge);

	assert(b->item_cnt < b->items_capacity);
	hpa_purge_item_t *hp_item = &b->items[b->item_cnt];
	b->item_cnt++;
	hp_item->hp = to_purge;

	/* Gather all the metadata we'll need during the purge. */
	hp_item->dehugify = hpa_needs_dehugify(shard, hp_item->hp);
	hpdata_purged_when_empty_and_huge_set(hp_item->hp,
	    hpdata_huge_get(hp_item->hp) && hpdata_empty(hp_item->hp));

	size_t nranges;
	size_t ndirty = hpdata_purge_begin(hp_item->hp, &hp_item->state,
	    &nranges);
	/* We picked hp to purge, so it should have some dirty ranges. */
	assert(ndirty > 0 && nranges > 0);
	b->ndirty_in_batch += ndirty;
	b->nranges += nranges;
	return ndirty;
}

/* Finish the purge of one hugepage. */
static inline void
hpa_purge_finish_hp(tsdn_t *tsdn, hpa_shard_t *shard,
    hpa_purge_item_t *hp_item) {
	if (hp_item->dehugify) {
		shard->stats.ndehugifies++;
	}

	/* The hpdata updates. */
	psset_update_begin(&shard->psset, hp_item->hp);
	if (hpdata_huge_get(hp_item->hp)) {
		/*
		 * Even when dehugify is not explicitly called, the page is
		 * assumed to be non-huge after purge.
		 */
		hpdata_dehugify(hp_item->hp);
	}
	hpdata_purge_end(hp_item->hp, &hp_item->state);
	hpdata_mid_purge_set(hp_item->hp, false);
	hpdata_alloc_allowed_set(hp_item->hp, true);
	hpa_update_purge_hugify_eligibility(tsdn, shard, hp_item->hp);
	psset_update_end(&shard->psset, hp_item->hp);
}

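/*
 * Taken together, the two helpers above are the per-hugepage halves of a
 * batched purge pass: hpa_purge_start_hp() picks a purgeable slab, marks it
 * mid-purge (disallowing allocation, further purging and hugification) and
 * records its dirty ranges in the batch; hpa_purge() below then hands the
 * whole batch to hpa_purge_batch() with the shard mutex dropped; and
 * hpa_purge_finish_hp() clears the mid-purge state, dehugifying if needed.
 */
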
/* Returns the number of hugepages purged. */
static inline size_t
hpa_purge(tsdn_t *tsdn, hpa_shard_t *shard, size_t max_hp) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	assert(max_hp > 0);

	assert(HPA_PURGE_BATCH_MAX > 0);
	assert(HPA_PURGE_BATCH_MAX <
	    (VARIABLE_ARRAY_SIZE_MAX / sizeof(hpa_purge_item_t)));
	VARIABLE_ARRAY(hpa_purge_item_t, items, HPA_PURGE_BATCH_MAX);
	hpa_purge_batch_t batch = {
		.max_hp = max_hp,
		.npurged_hp_total = 0,
		.items = &items[0],
		.items_capacity = HPA_PURGE_BATCH_MAX,
		.range_watermark = hpa_process_madvise_max_iovec_len(),
	};
	assert(batch.range_watermark > 0);

	while (1) {
		hpa_batch_pass_start(&batch);
		assert(hpa_batch_empty(&batch));
		while (!hpa_batch_full(&batch)
		    && hpa_should_purge(tsdn, shard)) {
			size_t ndirty = hpa_purge_start_hp(&batch, shard);
			if (ndirty == 0) {
				break;
			}
			shard->npending_purge += ndirty;
			batch.npurged_hp_total++;
		}
		if (hpa_batch_empty(&batch)) {
			break;
		}

		hpa_hooks_t *hooks = &shard->central->hooks;
		malloc_mutex_unlock(tsdn, &shard->mtx);
		hpa_purge_batch(hooks, batch.items, batch.item_cnt);
		malloc_mutex_lock(tsdn, &shard->mtx);

		/* The shard updates. */
		shard->npending_purge -= batch.ndirty_in_batch;
		shard->stats.npurges += batch.ndirty_in_batch;
		shard->central->hooks.curtime(&shard->last_purge,
		    /* first_reading */ false);

		for (size_t i = 0; i < batch.item_cnt; ++i) {
			hpa_purge_finish_hp(tsdn, shard, &batch.items[i]);
		}
	}
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	shard->stats.npurge_passes++;
	return batch.npurged_hp_total;
}

/* Returns whether or not we hugified anything. */
static bool
hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);

	if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
		return false;
	}

	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify == NULL) {
		return false;
	}
	assert(hpdata_hugify_allowed_get(to_hugify));
	assert(!hpdata_changing_state_get(to_hugify));

	/* Make sure that it's been hugifiable for long enough. */
	nstime_t time_hugify_allowed = hpdata_time_hugify_allowed(to_hugify);
	uint64_t millis = shard->central->hooks.ms_since(&time_hugify_allowed);
	if (millis < shard->opts.hugify_delay_ms) {
		return false;
	}

	/*
	 * Don't let anyone else purge or hugify this page while
	 * we're hugifying it (allocations and deallocations are
	 * OK).
	 */
	psset_update_begin(&shard->psset, to_hugify);
	hpdata_mid_hugify_set(to_hugify, true);
	hpdata_purge_allowed_set(to_hugify, false);
	hpdata_disallow_hugify(to_hugify);
	assert(hpdata_alloc_allowed_get(to_hugify));
	psset_update_end(&shard->psset, to_hugify);

	/*
	 * Without lazy hugification, the user relies on eagerly setting the HG
	 * bit, or on leaving everything up to the kernel (e.g. THP
	 * enabled=always).  We still pretend that the call succeeds, to keep
	 * our accounting close to what the user believes is the truth on the
	 * target system, but we don't update the nhugifies stat, since no
	 * system call is made.
	 */
	if (hpa_is_hugify_lazy(shard) || opt_experimental_hpa_enforce_hugify) {
		malloc_mutex_unlock(tsdn, &shard->mtx);
		bool err = shard->central->hooks.hugify(
		    hpdata_addr_get(to_hugify), HUGEPAGE,
		    shard->opts.hugify_sync);
		malloc_mutex_lock(tsdn, &shard->mtx);
		shard->stats.nhugifies++;
		if (err) {
			/*
			 * When asynchronous hugification is used (the
			 * shard->opts.hugify_sync option is false), we don't
			 * expect to get here unless something went terribly
			 * wrong.  Because the underlying syscall only sets a
			 * kernel flag for the memory range (the actual
			 * hugification happens asynchronously and we get no
			 * feedback about its outcome), we expect the syscall
			 * to succeed every time.
			 */
			shard->stats.nhugify_failures++;
		}
	}

	psset_update_begin(&shard->psset, to_hugify);
	hpdata_hugify(to_hugify);
	hpdata_mid_hugify_set(to_hugify, false);
	hpa_update_purge_hugify_eligibility(tsdn, shard, to_hugify);
	psset_update_end(&shard->psset, to_hugify);

	return true;
}

static bool
hpa_min_purge_interval_passed(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	uint64_t since_last_purge_ms = nstime_ms_between(&shard->last_purge,
	    &shard->last_time_work_attempted);
	return since_last_purge_ms >= shard->opts.min_purge_interval_ms;
}

static inline void
hpa_update_time_work_attempted(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	shard->central->hooks.curtime(&shard->last_time_work_attempted,
	    /* first_reading */ false);
}

/*
 * Execution of deferred work is forced if it's triggered by an explicit
 * hpa_shard_do_deferred_work() call.
 */
static void
hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard,
    bool forced) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	if (!forced && shard->opts.deferral_allowed) {
		return;
	}
	hpa_update_time_work_attempted(tsdn, shard);
	/*
	 * If we're on a background thread, do work so long as there's work to
	 * be done.  Otherwise, bound latency to not be *too* bad by doing at
	 * most a small fixed number of operations.
	 */
	size_t max_ops = (forced ? (size_t)-1 : 16);
	size_t nops = 0;

	/*
	 * Always purge before hugifying, to make sure we get some
	 * ability to hit our quiescence targets.
	 */
	/*
	 * Make sure we respect the purge interval setting and don't purge
	 * too frequently.
	 */
	if (hpa_min_purge_interval_passed(tsdn, shard)) {
		size_t max_purges = max_ops;
		/*
		 * Limit the number of hugepages (slabs) to purge.  When the
		 * experimental_max_purge_nhp option is used, there is no
		 * guarantee that we'll always respect the dirty_mult option;
		 * experimental_max_purge_nhp is a way to configure the same
		 * behavior that the old (buggy) implementation of the purging
		 * algorithm provided.
		 */
		ssize_t max_purge_nhp = shard->opts.experimental_max_purge_nhp;
		if (max_purge_nhp != -1 && max_purges > (size_t)max_purge_nhp) {
			max_purges = max_purge_nhp;
		}
		malloc_mutex_assert_owner(tsdn, &shard->mtx);
		nops += hpa_purge(tsdn, shard, max_purges);
		malloc_mutex_assert_owner(tsdn, &shard->mtx);
	}

	/*
	 * Try to hugify at least once, even if we're out of operations, so
	 * that we make at least some progress on hugification even in the
	 * worst case.
	 */
	while (hpa_try_hugify(tsdn, shard) && nops < max_ops) {
		malloc_mutex_assert_owner(tsdn, &shard->mtx);
		nops++;
	}
}

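/*
 * A descriptive note on callers: hpa_shard_maybe_do_deferred_work() is invoked
 * opportunistically (forced == false) from the allocation and deallocation
 * slow paths, where it is a no-op whenever deferral is allowed, and with
 * forced == true from hpa_shard_do_deferred_work() and
 * hpa_shard_set_deferral_allowed() below, e.g. when a background thread gets
 * around to this shard.
 */
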
static edata_t *
hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    bool *oom) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	bool err;
	edata_t *edata = edata_cache_fast_get(tsdn, &shard->ecf);
	if (edata == NULL) {
		*oom = true;
		return NULL;
	}

	hpdata_t *ps = psset_pick_alloc(&shard->psset, size);
	if (ps == NULL) {
		edata_cache_fast_put(tsdn, &shard->ecf, edata);
		return NULL;
	}

	psset_update_begin(&shard->psset, ps);

	if (hpdata_empty(ps)) {
		/*
		 * If the pageslab used to be empty, treat it as though it's
		 * brand new for fragmentation-avoidance purposes; what we're
		 * trying to approximate is the age of the allocations *in*
		 * that pageslab, and the allocations in the new pageslab are
		 * by definition the youngest in this hpa shard.
		 */
		hpdata_age_set(ps, shard->age_counter++);
	}

	void *addr = hpdata_reserve_alloc(ps, size);
	JE_USDT(hpa_alloc, 5, shard->ind, addr, size, hpdata_nactive_get(ps),
	    hpdata_age_get(ps));
	edata_init(edata, shard->ind, addr, size, /* slab */ false, SC_NSIZES,
	    /* sn */ hpdata_age_get(ps), extent_state_active,
	    /* zeroed */ false, /* committed */ true, EXTENT_PAI_HPA,
	    EXTENT_NOT_HEAD);
	edata_ps_set(edata, ps);

	/*
	 * This could theoretically be moved outside of the critical section,
	 * but that introduces the potential for a race.  Without the lock, the
	 * (initially nonempty, since this is the reuse pathway) pageslab we
	 * allocated out of could become otherwise empty while the lock is
	 * dropped.  This would force us to deal with a pageslab eviction down
	 * the error pathway, which is a pain.
	 */
	err = emap_register_boundary(tsdn, shard->emap, edata, SC_NSIZES,
	    /* slab */ false);
	if (err) {
		hpdata_unreserve(ps, edata_addr_get(edata),
		    edata_size_get(edata));
		JE_USDT(hpa_dalloc_err, 5, shard->ind, edata_addr_get(edata),
		    edata_size_get(edata), hpdata_nactive_get(ps),
		    hpdata_age_get(ps));
		/*
		 * We should arguably reset dirty state here, but this would
		 * require some sort of prepare + commit functionality that's a
		 * little much to deal with for now.
		 *
		 * We don't have a do_deferred_work down this pathway, on the
		 * principle that we didn't *really* affect shard state (we
		 * tweaked the stats, but our tweaks weren't really accurate).
		 */
		psset_update_end(&shard->psset, ps);
		edata_cache_fast_put(tsdn, &shard->ecf, edata);
		*oom = true;
		return NULL;
	}

	hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
	psset_update_end(&shard->psset, ps);
	return edata;
}

static size_t
hpa_try_alloc_batch_no_grow_locked(tsdn_t *tsdn, hpa_shard_t *shard,
    size_t size, bool *oom, size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	size_t nsuccess = 0;
	for (; nsuccess < nallocs; nsuccess++) {
		edata_t *edata = hpa_try_alloc_one_no_grow(tsdn, shard, size,
		    oom);
		if (edata == NULL) {
			break;
		}
		edata_list_active_append(results, edata);
	}

	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
	*deferred_work_generated = hpa_shard_has_deferred_work(tsdn, shard);
	return nsuccess;
}

static size_t
hpa_try_alloc_batch_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    bool *oom, size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
	malloc_mutex_lock(tsdn, &shard->mtx);
	size_t nsuccess = hpa_try_alloc_batch_no_grow_locked(tsdn, shard, size,
	    oom, nallocs, results, deferred_work_generated);
	malloc_mutex_unlock(tsdn, &shard->mtx);
	return nsuccess;
}

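/*
 * A lock-ordering note for the growth path below: grow_mtx is never acquired
 * while the shard mtx is held; on the growth path it is taken first and the
 * shard mtx is (re)acquired under it, while the potentially expensive
 * hpa_central_extract() call runs with grow_mtx alone, so deallocations and
 * allocations that can be served without growing may proceed concurrently.
 */
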
static size_t
hpa_alloc_batch_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
    size_t nallocs, edata_list_active_t *results,
    bool *deferred_work_generated) {
	assert(size <= HUGEPAGE);
	assert(size <= shard->opts.slab_max_alloc || size == sz_s2u(size));
	bool oom = false;

	size_t nsuccess = hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
	    nallocs, results, deferred_work_generated);

	if (nsuccess == nallocs || oom) {
		return nsuccess;
	}

	/*
	 * We didn't OOM, but weren't able to fill everything requested of us;
	 * try to grow.
	 */
	malloc_mutex_lock(tsdn, &shard->grow_mtx);
	/*
	 * Check for grow races; maybe some earlier thread expanded the psset
	 * in between when we dropped the main mutex and grabbed the grow
	 * mutex.
	 */
	nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom,
	    nallocs - nsuccess, results, deferred_work_generated);
	if (nsuccess == nallocs || oom) {
		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
		return nsuccess;
	}

	/*
	 * Note that we don't hold shard->mtx here (while growing);
	 * deallocations (and allocations of smaller sizes) may still succeed
	 * while we're doing this potentially expensive system call.
	 */
	hpdata_t *ps = hpa_central_extract(tsdn, shard->central, size,
	    shard->age_counter++, hpa_is_hugify_eager(shard), &oom);
	if (ps == NULL) {
		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
		return nsuccess;
	}

	/*
	 * We got the pageslab; allocate from it.  This holds the grow mutex
	 * while doing deferred work, but this is an uncommon path; the
	 * simplicity is worth it.
	 */
	malloc_mutex_lock(tsdn, &shard->mtx);
	psset_insert(&shard->psset, ps);
	nsuccess += hpa_try_alloc_batch_no_grow_locked(tsdn, shard, size, &oom,
	    nallocs - nsuccess, results, deferred_work_generated);
	malloc_mutex_unlock(tsdn, &shard->mtx);

	/*
	 * Drop grow_mtx before doing deferred work; other threads blocked on
	 * it should be allowed to proceed while we're working.
	 */
	malloc_mutex_unlock(tsdn, &shard->grow_mtx);

	return nsuccess;
}

static void
hpa_assert_results(tsdn_t *tsdn, hpa_shard_t *shard,
    edata_list_active_t *results) {
	/*
	 * Guard the sanity checks with config_debug because the loop cannot be
	 * proven non-circular by the compiler, even if everything within the
	 * loop is optimized away.
	 */
	if (config_debug) {
		edata_t *edata;
		ql_foreach (edata, &results->head, ql_link_active) {
			emap_assert_mapped(tsdn, shard->emap, edata);
			assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
			assert(edata_state_get(edata) == extent_state_active);
			assert(edata_arena_ind_get(edata) == shard->ind);
			assert(edata_szind_get_maybe_invalid(edata) ==
			    SC_NSIZES);
			assert(!edata_slab_get(edata));
			assert(edata_committed_get(edata));
			assert(edata_base_get(edata) == edata_addr_get(edata));
			assert(edata_base_get(edata) != NULL);
		}
	}
}

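/*
 * hpa_alloc() below consults the small extent cache (SEC) first and, on a miss
 * for a SEC-supported size, over-allocates in order to refill it.  As an
 * illustration (hypothetical numbers): if batch_fill_extra were 7, a miss
 * would request 8 extents from the psset, hand the first one back to the
 * caller, and push the remaining successful allocations into the SEC via
 * sec_fill().
 */
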
edata_t *
hpa_alloc(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, size_t alignment,
    bool zero, bool guarded, bool frequent_reuse,
    bool *deferred_work_generated) {
	assert((size & PAGE_MASK) == 0);
	assert(!guarded);
	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);

	/* We don't handle alignment or zeroing for now. */
	if (alignment > PAGE || zero) {
		return NULL;
	}
	/*
	 * frequent_reuse here indicates that this request comes from the arena
	 * bins, in which case it will be split into slabs, and therefore there
	 * is no intrinsic slack in the allocation (the entire range of the
	 * allocated size will be accessed).
	 *
	 * In this case, bypass the slab_max_alloc limit (if still within the
	 * hugepage size).  These requests do not raise internal-fragmentation
	 * concerns with hugepages (again, the full size will be used).
	 */
	if (!(frequent_reuse && size <= HUGEPAGE)
	    && (size > shard->opts.slab_max_alloc)) {
		return NULL;
	}

	edata_t *edata = sec_alloc(tsdn, &shard->sec, size);
	if (edata != NULL) {
		return edata;
	}

	size_t nallocs = sec_size_supported(&shard->sec, size)
	    ? shard->sec.opts.batch_fill_extra + 1 : 1;
	edata_list_active_t results;
	edata_list_active_init(&results);
	size_t nsuccess = hpa_alloc_batch_psset(tsdn, shard, size, nallocs,
	    &results, deferred_work_generated);

	hpa_assert_results(tsdn, shard, &results);

	edata = edata_list_active_first(&results);
	if (edata != NULL) {
		edata_list_active_remove(&results, edata);
		assert(nsuccess > 0);
		nsuccess--;
	}
	if (nsuccess > 0) {
		assert(sec_size_supported(&shard->sec, size));
		sec_fill(tsdn, &shard->sec, size, &results, nsuccess);
		/* Unlikely rollback in case of overfill. */
		if (!edata_list_active_empty(&results)) {
			hpa_dalloc_batch(tsdn, shard, &results,
			    deferred_work_generated);
		}
	}

	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);
	return edata;
}

bool
hpa_expand(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata, size_t old_size,
    size_t new_size, bool zero, bool *deferred_work_generated) {
	/* Expand not yet supported. */
	return true;
}

bool
hpa_shrink(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata, size_t old_size,
    size_t new_size, bool *deferred_work_generated) {
	/* Shrink not yet supported. */
	return true;
}

static void
hpa_dalloc_prepare_unlocked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
	malloc_mutex_assert_not_owner(tsdn, &shard->mtx);

	assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
	assert(edata_state_get(edata) == extent_state_active);
	assert(edata_arena_ind_get(edata) == shard->ind);
	assert(edata_szind_get_maybe_invalid(edata) == SC_NSIZES);
	assert(edata_committed_get(edata));
	assert(edata_base_get(edata) != NULL);

	/*
	 * Another thread shouldn't be trying to touch the metadata of an
	 * allocation being freed.  The one exception is a merge attempt from a
	 * lower-addressed PAC extent; in this case we have a nominal race on
	 * the edata metadata bits, but in practice the fact that the PAI bits
	 * are different will prevent any further access.  The race is bad, but
	 * benign in practice, and the long term plan is to track enough state
	 * in the rtree to prevent these merge attempts in the first place.
	 */
	edata_addr_set(edata, edata_base_get(edata));
	edata_zeroed_set(edata, false);
	emap_deregister_boundary(tsdn, shard->emap, edata);
}

static void
hpa_dalloc_locked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);

	/*
	 * Release the metadata early, to avoid having to remember to do it
	 * while we're also doing tricky purging logic.  First, we need to grab
	 * a few bits of metadata from it.
	 *
	 * Note that the shard mutex protects ps's metadata too; it wouldn't be
	 * correct to try to read most information out of it without the lock.
	 */
	hpdata_t *ps = edata_ps_get(edata);
	/* Currently, all edatas come from pageslabs. */
	assert(ps != NULL);
	void *unreserve_addr = edata_addr_get(edata);
	size_t unreserve_size = edata_size_get(edata);
	edata_cache_fast_put(tsdn, &shard->ecf, edata);

	psset_update_begin(&shard->psset, ps);
	hpdata_unreserve(ps, unreserve_addr, unreserve_size);
	JE_USDT(hpa_dalloc, 5, shard->ind, unreserve_addr, unreserve_size,
	    hpdata_nactive_get(ps), hpdata_age_get(ps));
	hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
	psset_update_end(&shard->psset, ps);
}

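/*
 * Deallocation is split into an unlocked prepare step
 * (hpa_dalloc_prepare_unlocked(), which deregisters the emap boundary) and a
 * locked step (hpa_dalloc_locked(), which returns the edata to the cache and
 * unreserves the range in its pageslab).  hpa_dalloc_batch() below runs the
 * prepare step for every extent first, so that the shard mutex only needs to
 * be taken once for the whole batch.
 */
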
static void
hpa_dalloc_batch(tsdn_t *tsdn, hpa_shard_t *shard, edata_list_active_t *list,
    bool *deferred_work_generated) {
	edata_t *edata;
	ql_foreach (edata, &list->head, ql_link_active) {
		hpa_dalloc_prepare_unlocked(tsdn, shard, edata);
	}

	malloc_mutex_lock(tsdn, &shard->mtx);
	/* Now, remove from the list. */
	while ((edata = edata_list_active_first(list)) != NULL) {
		edata_list_active_remove(list, edata);
		hpa_dalloc_locked(tsdn, shard, edata);
	}
	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
	*deferred_work_generated = hpa_shard_has_deferred_work(tsdn, shard);

	malloc_mutex_unlock(tsdn, &shard->mtx);
}

void
hpa_dalloc(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata,
    bool *deferred_work_generated) {
	assert(!edata_guarded_get(edata));

	edata_list_active_t dalloc_list;
	edata_list_active_init(&dalloc_list);
	edata_list_active_append(&dalloc_list, edata);
	sec_dalloc(tsdn, &shard->sec, &dalloc_list);
	if (edata_list_active_empty(&dalloc_list)) {
		/* sec consumed the pointer */
		*deferred_work_generated = false;
		return;
	}
	/* We may have more than one pointer to flush now */
	hpa_dalloc_batch(tsdn, shard, &dalloc_list, deferred_work_generated);
}

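/*
 * A worked example for the wait computation below (illustrative numbers only):
 * with hugify_delay_ms == 10000 and a slab whose hugify-allowed timestamp is
 * 4000 ms old, the background thread is told to sleep for the remaining
 * 6000 ms, i.e. 6000 * 1000 * 1000 ns; if a purge is also pending and
 * min_purge_interval_ms would permit it sooner, the smaller of the two
 * deadlines wins.
 */
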
/*
 * Calculate time until either purging or hugification ought to happen.
 * Called by background threads.
 */
uint64_t
hpa_time_until_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
	uint64_t time_ns = BACKGROUND_THREAD_DEFERRED_MAX;

	malloc_mutex_lock(tsdn, &shard->mtx);

	hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
	if (to_hugify != NULL) {
		nstime_t time_hugify_allowed =
		    hpdata_time_hugify_allowed(to_hugify);
		uint64_t since_hugify_allowed_ms =
		    shard->central->hooks.ms_since(&time_hugify_allowed);
		/*
		 * If not enough time has passed since hugification was
		 * allowed, sleep for the rest.
		 */
		if (since_hugify_allowed_ms < shard->opts.hugify_delay_ms) {
			time_ns = shard->opts.hugify_delay_ms -
			    since_hugify_allowed_ms;
			time_ns *= 1000 * 1000;
		} else {
			malloc_mutex_unlock(tsdn, &shard->mtx);
			return BACKGROUND_THREAD_DEFERRED_MIN;
		}
	}

	if (hpa_should_purge(tsdn, shard)) {
		/*
		 * If we haven't purged before, no need to check interval
		 * between purges.  Simply purge as soon as possible.
		 */
		if (shard->stats.npurge_passes == 0) {
			malloc_mutex_unlock(tsdn, &shard->mtx);
			return BACKGROUND_THREAD_DEFERRED_MIN;
		}
		uint64_t since_last_purge_ms =
		    shard->central->hooks.ms_since(&shard->last_purge);

		if (since_last_purge_ms < shard->opts.min_purge_interval_ms) {
			uint64_t until_purge_ns;
			until_purge_ns = shard->opts.min_purge_interval_ms -
			    since_last_purge_ms;
			until_purge_ns *= 1000 * 1000;

			if (until_purge_ns < time_ns) {
				time_ns = until_purge_ns;
			}
		} else {
			time_ns = BACKGROUND_THREAD_DEFERRED_MIN;
		}
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
	return time_ns;
}

static void
hpa_sec_flush_impl(tsdn_t *tsdn, hpa_shard_t *shard) {
	edata_list_active_t to_flush;
	edata_list_active_init(&to_flush);
	sec_flush(tsdn, &shard->sec, &to_flush);

	bool deferred_work_generated;
	hpa_dalloc_batch(tsdn, shard, &to_flush, &deferred_work_generated);
}

void
hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	hpa_sec_flush_impl(tsdn, shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	edata_cache_fast_disable(tsdn, &shard->ecf);
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

void
hpa_shard_flush(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_sec_flush_impl(tsdn, shard);
}

static void
hpa_shard_assert_stats_empty(psset_bin_stats_t *bin_stats) {
	assert(bin_stats->npageslabs == 0);
	assert(bin_stats->nactive == 0);
}

static void
hpa_assert_empty(tsdn_t *tsdn, hpa_shard_t *shard, psset_t *psset) {
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	for (int huge = 0; huge <= 1; huge++) {
		hpa_shard_assert_stats_empty(&psset->stats.full_slabs[huge]);
		for (pszind_t i = 0; i < PSSET_NPSIZES; i++) {
			hpa_shard_assert_stats_empty(
			    &psset->stats.nonfull_slabs[i][huge]);
		}
	}
}

void
hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	hpa_shard_flush(tsdn, shard);

	/*
	 * By the time we're here, the arena code should have dalloc'd all the
	 * active extents, which means we should have eventually evicted
	 * everything from the psset, so it shouldn't be able to serve even a
	 * 1-page allocation.
	 */
	if (config_debug) {
		malloc_mutex_lock(tsdn, &shard->mtx);
		hpa_assert_empty(tsdn, shard, &shard->psset);
		malloc_mutex_unlock(tsdn, &shard->mtx);
	}
	hpdata_t *ps;
	while ((ps = psset_pick_alloc(&shard->psset, PAGE)) != NULL) {
		/* There should be no allocations anywhere. */
		assert(hpdata_empty(ps));
		psset_remove(&shard->psset, ps);
		shard->central->hooks.unmap(hpdata_addr_get(ps), HUGEPAGE);
	}
}

void
hpa_shard_set_deferral_allowed(tsdn_t *tsdn, hpa_shard_t *shard,
    bool deferral_allowed) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	bool deferral_previously_allowed = shard->opts.deferral_allowed;
	shard->opts.deferral_allowed = deferral_allowed;
	if (deferral_previously_allowed && !deferral_allowed) {
		hpa_shard_maybe_do_deferred_work(tsdn, shard,
		    /* forced */ true);
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

void
hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_lock(tsdn, &shard->mtx);
	hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ true);
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

void
hpa_shard_prefork2(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	sec_prefork2(tsdn, &shard->sec);
}

void
hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_prefork(tsdn, &shard->grow_mtx);
}

void
hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	malloc_mutex_prefork(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	sec_postfork_parent(tsdn, &shard->sec);
	malloc_mutex_postfork_parent(tsdn, &shard->grow_mtx);
	malloc_mutex_postfork_parent(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) {
	hpa_do_consistency_checks(shard);

	sec_postfork_child(tsdn, &shard->sec);
	malloc_mutex_postfork_child(tsdn, &shard->grow_mtx);
	malloc_mutex_postfork_child(tsdn, &shard->mtx);
}