mirror of
https://github.com/jemalloc/jemalloc.git
synced 2026-06-03 02:34:17 +03:00
This change includes the following improvements: - Remove the hpa_sec_batch_fill_extra parameter. - Refactor the hpa_alloc() code and helper functions to be able to allocate more than one extent out of a single pageslab. This way we can amortize the per-pageslab costs (active bitmap iteration, pageslab metadata updates) across multiple extents. - Decide on a min and max number of extents that will be allocated in hpa_alloc(). The code will try to allocate at least the min and allocate up to the max as long as we can allocate additional ones from the pageslab we already have, as additional allocations are relatively cheap. - Add extent allocation distribution stats. - Amend hpa_sec_integration.c unit test.
1274 lines
40 KiB
C
1274 lines
40 KiB
C
#include "jemalloc/internal/jemalloc_preamble.h"
|
|
#include "jemalloc/internal/jemalloc_internal_includes.h"
|
|
|
|
#include "jemalloc/internal/hpa.h"
|
|
#include "jemalloc/internal/hpa_utils.h"
|
|
|
|
#include "jemalloc/internal/fb.h"
|
|
#include "jemalloc/internal/witness.h"
|
|
#include "jemalloc/internal/jemalloc_probe.h"
|
|
|
|
static void hpa_dalloc_batch(tsdn_t *tsdn, hpa_shard_t *shard,
|
|
edata_list_active_t *list, bool *deferred_work_generated);
|
|
|
|
/*
 * Printable names of the hugify styles.  NOTE(review): presumably indexed by
 * the hugify style enum (auto, none, eager, lazy) -- confirm the ordering
 * against the enum definition, which is not visible here.
 */
const char *const hpa_hugify_style_names[] = {"auto", "none", "eager", "lazy"};
|
|
|
|
/*
 * Experimental option, enabled by default.  It is not read anywhere in the
 * visible part of this file; NOTE(review): judging by the name it controls
 * whether pageslabs start out huge when THP is set to "always" -- confirm
 * against its users.
 */
bool opt_experimental_hpa_start_huge_if_thp_always = true;
|
|
/*
 * Experimental option, disabled by default.  When set, the hugify-eligibility
 * marking, the hugify hook call and the dehugify decision in this file behave
 * as they do for lazy hugification, regardless of the configured hugify style.
 */
bool opt_experimental_hpa_enforce_hugify = false;
|
|
|
|
bool
|
|
hpa_hugepage_size_exceeds_limit(void) {
|
|
return HUGEPAGE > HUGEPAGE_MAX_EXPECTED_SIZE;
|
|
}
|
|
|
|
bool
|
|
hpa_supported(void) {
|
|
#ifdef _WIN32
|
|
/*
|
|
* At least until the API and implementation is somewhat settled, we
|
|
* don't want to try to debug the VM subsystem on the hardest-to-test
|
|
* platform.
|
|
*/
|
|
return false;
|
|
#endif
|
|
if (!pages_can_hugify) {
|
|
return false;
|
|
}
|
|
/*
|
|
* We fundamentally rely on an address-space-hungry growth strategy for
|
|
* hugepages.
|
|
*/
|
|
if (LG_SIZEOF_PTR != 3) {
|
|
return false;
|
|
}
|
|
/*
|
|
* If we couldn't detect the value of HUGEPAGE, HUGEPAGE_PAGES becomes
|
|
* this sentinel value -- see the comment in pages.h.
|
|
*/
|
|
if (HUGEPAGE_PAGES == 1) {
|
|
return false;
|
|
}
|
|
/* As mentioned in pages.h, do not support If HUGEPAGE is too large. */
|
|
if (hpa_hugepage_size_exceeds_limit()) {
|
|
return false;
|
|
}
|
|
return true;
|
|
}
|
|
|
|
static void
|
|
hpa_do_consistency_checks(hpa_shard_t *shard) {
|
|
assert(shard->base != NULL);
|
|
}
|
|
|
|
bool
|
|
hpa_shard_init(tsdn_t *tsdn, hpa_shard_t *shard, hpa_central_t *central,
|
|
emap_t *emap, base_t *base, edata_cache_t *edata_cache, unsigned ind,
|
|
const hpa_shard_opts_t *opts, const sec_opts_t *sec_opts) {
|
|
/* malloc_conf processing should have filtered out these cases. */
|
|
assert(hpa_supported());
|
|
bool err;
|
|
err = malloc_mutex_init(&shard->grow_mtx, "hpa_shard_grow",
|
|
WITNESS_RANK_HPA_SHARD_GROW, malloc_mutex_rank_exclusive);
|
|
if (err) {
|
|
return true;
|
|
}
|
|
err = malloc_mutex_init(&shard->mtx, "hpa_shard",
|
|
WITNESS_RANK_HPA_SHARD, malloc_mutex_rank_exclusive);
|
|
if (err) {
|
|
return true;
|
|
}
|
|
|
|
assert(edata_cache != NULL);
|
|
shard->central = central;
|
|
shard->base = base;
|
|
edata_cache_fast_init(&shard->ecf, edata_cache);
|
|
psset_init(&shard->psset);
|
|
shard->age_counter = 0;
|
|
shard->ind = ind;
|
|
shard->emap = emap;
|
|
|
|
shard->opts = *opts;
|
|
|
|
shard->npending_purge = 0;
|
|
nstime_init_zero(&shard->last_purge);
|
|
nstime_init_zero(&shard->last_time_work_attempted);
|
|
|
|
shard->stats.npurge_passes = 0;
|
|
shard->stats.npurges = 0;
|
|
shard->stats.nhugifies = 0;
|
|
shard->stats.nhugify_failures = 0;
|
|
shard->stats.ndehugifies = 0;
|
|
memset(shard->stats.hpa_alloc_min_extents, 0,
|
|
sizeof(shard->stats.hpa_alloc_min_extents));
|
|
memset(shard->stats.hpa_alloc_max_extents, 0,
|
|
sizeof(shard->stats.hpa_alloc_max_extents));
|
|
memset(shard->stats.hpa_alloc_extents, 0,
|
|
sizeof(shard->stats.hpa_alloc_extents));
|
|
memset(shard->stats.hpa_alloc_ps, 0, sizeof(shard->stats.hpa_alloc_ps));
|
|
memset(shard->stats.hpa_alloc_pages_per_ps, 0,
|
|
sizeof(shard->stats.hpa_alloc_pages_per_ps));
|
|
memset(shard->stats.hpa_alloc_extents_per_ps, 0,
|
|
sizeof(shard->stats.hpa_alloc_extents_per_ps));
|
|
memset(shard->stats.hpa_alloc_total_elapsed_ns_per_ps, 0,
|
|
sizeof(shard->stats.hpa_alloc_total_elapsed_ns_per_ps));
|
|
|
|
err = sec_init(tsdn, &shard->sec, base, sec_opts);
|
|
if (err) {
|
|
return true;
|
|
}
|
|
|
|
hpa_do_consistency_checks(shard);
|
|
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Note that the stats functions here follow the usual stats naming conventions;
|
|
* "merge" obtains the stats from some live object of instance, while "accum"
|
|
* only combines the stats from one stats object to another. Hence the lack of
|
|
* locking here.
|
|
*/
|
|
static void
|
|
hpa_shard_nonderived_stats_accum(
|
|
hpa_shard_nonderived_stats_t *dst, hpa_shard_nonderived_stats_t *src) {
|
|
dst->npurge_passes += src->npurge_passes;
|
|
dst->npurges += src->npurges;
|
|
dst->nhugifies += src->nhugifies;
|
|
dst->nhugify_failures += src->nhugify_failures;
|
|
dst->ndehugifies += src->ndehugifies;
|
|
for (size_t i = 0; i <= SEC_MAX_NALLOCS; i++) {
|
|
dst->hpa_alloc_min_extents[i] += src->hpa_alloc_min_extents[i];
|
|
dst->hpa_alloc_max_extents[i] += src->hpa_alloc_max_extents[i];
|
|
dst->hpa_alloc_extents[i] += src->hpa_alloc_extents[i];
|
|
dst->hpa_alloc_ps[i] += src->hpa_alloc_ps[i];
|
|
dst->hpa_alloc_pages_per_ps[i] +=
|
|
src->hpa_alloc_pages_per_ps[i];
|
|
dst->hpa_alloc_extents_per_ps[i] +=
|
|
src->hpa_alloc_extents_per_ps[i];
|
|
dst->hpa_alloc_total_elapsed_ns_per_ps[i] +=
|
|
src->hpa_alloc_total_elapsed_ns_per_ps[i];
|
|
}
|
|
}
|
|
|
|
void
|
|
hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) {
|
|
psset_stats_accum(&dst->psset_stats, &src->psset_stats);
|
|
hpa_shard_nonderived_stats_accum(
|
|
&dst->nonderived_stats, &src->nonderived_stats);
|
|
sec_stats_accum(&dst->secstats, &src->secstats);
|
|
}
|
|
|
|
void
|
|
hpa_shard_stats_merge(
|
|
tsdn_t *tsdn, hpa_shard_t *shard, hpa_shard_stats_t *dst) {
|
|
hpa_do_consistency_checks(shard);
|
|
|
|
malloc_mutex_lock(tsdn, &shard->grow_mtx);
|
|
malloc_mutex_lock(tsdn, &shard->mtx);
|
|
psset_stats_accum(&dst->psset_stats, &shard->psset.stats);
|
|
hpa_shard_nonderived_stats_accum(&dst->nonderived_stats, &shard->stats);
|
|
malloc_mutex_unlock(tsdn, &shard->mtx);
|
|
malloc_mutex_unlock(tsdn, &shard->grow_mtx);
|
|
|
|
sec_stats_merge(tsdn, &shard->sec, &dst->secstats);
|
|
}
|
|
|
|
static bool
|
|
hpa_is_hugify_eager(hpa_shard_t *shard) {
|
|
return shard->opts.hugify_style == hpa_hugify_style_eager;
|
|
}
|
|
|
|
static bool
|
|
hpa_is_hugify_lazy(hpa_shard_t *shard) {
|
|
/* When hugify_sync==true we also set/unset HG bit manually */
|
|
return shard->opts.hugify_style == hpa_hugify_style_lazy
|
|
|| shard->opts.hugify_sync;
|
|
}
|
|
|
|
static bool
|
|
hpa_is_hugify_none(hpa_shard_t *shard) {
|
|
return shard->opts.hugify_style == hpa_hugify_style_none;
|
|
}
|
|
|
|
/*
|
|
* Experimentation has shown that when we are purging only HUGEPAGE ranges and
|
|
* hugifying eagerly (or thp enabled=always) we get huge pages more often. This
|
|
* helps us have more realistic accounting.
|
|
*/
|
|
static bool
|
|
hpa_should_assume_huge(hpa_shard_t *shard, const hpdata_t *ps) {
|
|
return (hpa_is_hugify_eager(shard) || hpa_is_hugify_none(shard))
|
|
&& hpdata_purged_when_empty_and_huge_get(ps);
|
|
}
|
|
|
|
static bool
|
|
hpa_good_hugification_candidate(hpa_shard_t *shard, hpdata_t *ps) {
|
|
/*
|
|
* Note that this needs to be >= rather than just >, because of the
|
|
* important special case in which the hugification threshold is exactly
|
|
* HUGEPAGE.
|
|
*/
|
|
return hpdata_nactive_get(ps) * PAGE
|
|
>= shard->opts.hugification_threshold;
|
|
}
|
|
|
|
static bool
|
|
hpa_good_purge_candidate(hpa_shard_t *shard, hpdata_t *ps) {
|
|
if (shard->opts.dirty_mult == (fxp_t)-1) {
|
|
/* No purging. */
|
|
return false;
|
|
}
|
|
size_t ndirty = hpdata_ndirty_get(ps);
|
|
/* Empty pages are good candidate for purging. */
|
|
if (ndirty > 0 && hpdata_empty(ps)) {
|
|
return true;
|
|
}
|
|
return ndirty * PAGE >= shard->opts.purge_threshold;
|
|
}
|
|
|
|
static size_t
|
|
hpa_adjusted_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
return psset_ndirty(&shard->psset) - shard->npending_purge;
|
|
}
|
|
|
|
static size_t
|
|
hpa_ndirty_max(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
if (shard->opts.dirty_mult == (fxp_t)-1) {
|
|
return (size_t)-1;
|
|
}
|
|
return fxp_mul_frac(
|
|
psset_nactive(&shard->psset), shard->opts.dirty_mult);
|
|
}
|
|
|
|
static bool
|
|
hpa_hugify_blocked_by_ndirty(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
|
|
if (to_hugify == NULL) {
|
|
return false;
|
|
}
|
|
return hpa_adjusted_ndirty(tsdn, shard)
|
|
+ hpdata_nretained_get(to_hugify)
|
|
> hpa_ndirty_max(tsdn, shard);
|
|
}
|
|
|
|
static bool
|
|
hpa_should_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
/*
|
|
* The page that is purgable may be delayed, but we just want to know
|
|
* if there is a need for bg thread to wake up in the future.
|
|
*/
|
|
hpdata_t *ps = psset_pick_purge(&shard->psset, NULL);
|
|
if (ps == NULL) {
|
|
return false;
|
|
}
|
|
if (hpa_adjusted_ndirty(tsdn, shard) > hpa_ndirty_max(tsdn, shard)) {
|
|
return true;
|
|
}
|
|
if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
|
|
return true;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static void
|
|
hpa_assume_huge(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
|
|
assert(hpa_should_assume_huge(shard, ps));
|
|
if (hpdata_huge_get(ps) || hpdata_empty(ps)) {
|
|
return;
|
|
}
|
|
|
|
if (hpdata_ntouched_get(ps) != HUGEPAGE_PAGES) {
|
|
hpdata_hugify(ps);
|
|
}
|
|
}
|
|
|
|
static void
|
|
hpa_update_purge_hugify_eligibility(
|
|
tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
if (hpdata_changing_state_get(ps)) {
|
|
hpdata_purge_allowed_set(ps, false);
|
|
hpdata_disallow_hugify(ps);
|
|
return;
|
|
}
|
|
/*
|
|
* Hugepages are distinctly costly to purge, so try to avoid it unless
|
|
* they're *particularly* full of dirty pages. Eventually, we should
|
|
* use a smarter / more dynamic heuristic for situations where we have
|
|
* to manually hugify.
|
|
*
|
|
* In situations where we don't manually hugify, this problem is
|
|
* reduced. The "bad" situation we're trying to avoid is one's that's
|
|
* common in some Linux configurations (where both enabled and defrag
|
|
* are set to madvise) that can lead to long latency spikes on the first
|
|
* access after a hugification. The ideal policy in such configurations
|
|
* is probably time-based for both purging and hugifying; only hugify a
|
|
* hugepage if it's met the criteria for some extended period of time,
|
|
* and only dehugify it if it's failed to meet the criteria for an
|
|
* extended period of time. When background threads are on, we should
|
|
* try to take this hit on one of them, as well.
|
|
*
|
|
* I think the ideal setting is THP always enabled, and defrag set to
|
|
* deferred; in that case we don't need any explicit calls on the
|
|
* allocator's end at all; we just try to pack allocations in a
|
|
* hugepage-friendly manner and let the OS hugify in the background.
|
|
*/
|
|
if (hpa_should_assume_huge(shard, ps)) {
|
|
/* Assume it is huge without the need to madvise */
|
|
hpa_assume_huge(tsdn, shard, ps);
|
|
}
|
|
if ((hpa_is_hugify_lazy(shard) || opt_experimental_hpa_enforce_hugify)
|
|
&& hpa_good_hugification_candidate(shard, ps)
|
|
&& !hpdata_huge_get(ps)) {
|
|
nstime_t now;
|
|
shard->central->hooks.curtime(&now, /* first_reading */ true);
|
|
hpdata_allow_hugify(ps, now);
|
|
}
|
|
bool purgable = hpa_good_purge_candidate(shard, ps);
|
|
if (purgable && !hpdata_purge_allowed_get(ps)
|
|
&& (shard->opts.min_purge_delay_ms > 0)) {
|
|
nstime_t now;
|
|
uint64_t delayns = shard->opts.min_purge_delay_ms * 1000 * 1000;
|
|
shard->central->hooks.curtime(&now, /* first_reading */ true);
|
|
nstime_iadd(&now, delayns);
|
|
hpdata_time_purge_allowed_set(ps, &now);
|
|
}
|
|
hpdata_purge_allowed_set(ps, purgable);
|
|
|
|
/*
|
|
* Once a hugepage has become eligible for hugification, we don't mark
|
|
* it as ineligible just because it stops meeting the criteria (this
|
|
* could lead to situations where a hugepage that spends most of its
|
|
* time meeting the criteria never quite getting hugified if there are
|
|
* intervening deallocations). The idea is that the hugification delay
|
|
* will allow them to get purged, resetting their "hugify-allowed" bit.
|
|
* If they don't get purged, then the hugification isn't hurting and
|
|
* might help. As an exception, we don't hugify hugepages that are now
|
|
* empty; it definitely doesn't help there until the hugepage gets
|
|
* reused, which is likely not for a while.
|
|
*/
|
|
if (hpdata_nactive_get(ps) == 0 && !hpa_should_assume_huge(shard, ps)) {
|
|
hpdata_disallow_hugify(ps);
|
|
}
|
|
}
|
|
|
|
static bool
|
|
hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
|
|
return to_hugify != NULL || hpa_should_purge(tsdn, shard);
|
|
}
|
|
|
|
static inline bool
|
|
hpa_needs_dehugify(hpa_shard_t *shard, const hpdata_t *ps) {
|
|
return (hpa_is_hugify_lazy(shard)
|
|
|| opt_experimental_hpa_enforce_hugify)
|
|
&& hpdata_huge_get(ps) && !hpdata_empty(ps);
|
|
}
|
|
|
|
/* Prepare purge of one page. Return number of dirty regular pages on it
|
|
* Return 0 if no purgable huge page is found
|
|
*
|
|
* If there was a page to purge its purge state is initialized
|
|
*/
|
|
static inline size_t
|
|
hpa_purge_start_hp(hpa_purge_batch_t *b, hpa_shard_t *shard) {
|
|
psset_t *psset = &shard->psset;
|
|
hpdata_t *to_purge = (shard->opts.min_purge_delay_ms > 0)
|
|
? psset_pick_purge(psset, &shard->last_time_work_attempted)
|
|
: psset_pick_purge(psset, NULL);
|
|
if (to_purge == NULL) {
|
|
return 0;
|
|
}
|
|
assert(hpdata_purge_allowed_get(to_purge));
|
|
assert(!hpdata_changing_state_get(to_purge));
|
|
|
|
/*
|
|
* Don't let anyone else purge or hugify this page while
|
|
* we're purging it (allocations and deallocations are
|
|
* OK).
|
|
*/
|
|
psset_update_begin(psset, to_purge);
|
|
assert(hpdata_alloc_allowed_get(to_purge));
|
|
hpdata_mid_purge_set(to_purge, true);
|
|
hpdata_purge_allowed_set(to_purge, false);
|
|
hpdata_disallow_hugify(to_purge);
|
|
/*
|
|
* Unlike with hugification (where concurrent
|
|
* allocations are allowed), concurrent allocation out
|
|
* of a hugepage being purged is unsafe; we might hand
|
|
* out an extent for an allocation and then purge it
|
|
* (clearing out user data).
|
|
*/
|
|
hpdata_alloc_allowed_set(to_purge, false);
|
|
psset_update_end(psset, to_purge);
|
|
|
|
assert(b->item_cnt < b->items_capacity);
|
|
hpa_purge_item_t *hp_item = &b->items[b->item_cnt];
|
|
b->item_cnt++;
|
|
hp_item->hp = to_purge;
|
|
/* Gather all the metadata we'll need during the purge. */
|
|
hp_item->dehugify = hpa_needs_dehugify(shard, hp_item->hp);
|
|
hpdata_purged_when_empty_and_huge_set(hp_item->hp,
|
|
hpdata_huge_get(hp_item->hp) && hpdata_empty(hp_item->hp));
|
|
size_t nranges;
|
|
size_t ndirty = hpdata_purge_begin(
|
|
hp_item->hp, &hp_item->state, &nranges);
|
|
/* We picked hp to purge, so it should have some dirty ranges */
|
|
assert(ndirty > 0 && nranges > 0);
|
|
b->ndirty_in_batch += ndirty;
|
|
b->nranges += nranges;
|
|
return ndirty;
|
|
}
|
|
|
|
/* Finish purge of one huge page. */
|
|
static inline void
|
|
hpa_purge_finish_hp(
|
|
tsdn_t *tsdn, hpa_shard_t *shard, hpa_purge_item_t *hp_item) {
|
|
if (hp_item->dehugify) {
|
|
shard->stats.ndehugifies++;
|
|
}
|
|
/* The hpdata updates. */
|
|
psset_update_begin(&shard->psset, hp_item->hp);
|
|
if (hpdata_huge_get(hp_item->hp)) {
|
|
/*
|
|
* Even when dehugify is not explicitly called, the page is
|
|
* assumed to be non-huge after purge.
|
|
*/
|
|
hpdata_dehugify(hp_item->hp);
|
|
}
|
|
hpdata_purge_end(hp_item->hp, &hp_item->state);
|
|
hpdata_mid_purge_set(hp_item->hp, false);
|
|
|
|
hpdata_alloc_allowed_set(hp_item->hp, true);
|
|
hpa_update_purge_hugify_eligibility(tsdn, shard, hp_item->hp);
|
|
|
|
psset_update_end(&shard->psset, hp_item->hp);
|
|
}
|
|
|
|
/* Returns number of huge pages purged. */
|
|
static inline size_t
|
|
hpa_purge(tsdn_t *tsdn, hpa_shard_t *shard, size_t max_hp) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
assert(max_hp > 0);
|
|
|
|
assert(HPA_PURGE_BATCH_MAX > 0);
|
|
assert(HPA_PURGE_BATCH_MAX
|
|
< (VARIABLE_ARRAY_SIZE_MAX / sizeof(hpa_purge_item_t)));
|
|
VARIABLE_ARRAY(hpa_purge_item_t, items, HPA_PURGE_BATCH_MAX);
|
|
hpa_purge_batch_t batch = {
|
|
.max_hp = max_hp,
|
|
.npurged_hp_total = 0,
|
|
.items = &items[0],
|
|
.items_capacity = HPA_PURGE_BATCH_MAX,
|
|
.range_watermark = hpa_process_madvise_max_iovec_len(),
|
|
};
|
|
assert(batch.range_watermark > 0);
|
|
|
|
while (1) {
|
|
hpa_batch_pass_start(&batch);
|
|
assert(hpa_batch_empty(&batch));
|
|
while (
|
|
!hpa_batch_full(&batch) && hpa_should_purge(tsdn, shard)) {
|
|
size_t ndirty = hpa_purge_start_hp(&batch, shard);
|
|
if (ndirty == 0) {
|
|
break;
|
|
}
|
|
shard->npending_purge += ndirty;
|
|
batch.npurged_hp_total++;
|
|
}
|
|
|
|
if (hpa_batch_empty(&batch)) {
|
|
break;
|
|
}
|
|
hpa_hooks_t *hooks = &shard->central->hooks;
|
|
malloc_mutex_unlock(tsdn, &shard->mtx);
|
|
hpa_purge_batch(hooks, batch.items, batch.item_cnt);
|
|
malloc_mutex_lock(tsdn, &shard->mtx);
|
|
|
|
/* The shard updates */
|
|
shard->npending_purge -= batch.ndirty_in_batch;
|
|
shard->stats.npurges += batch.ndirty_in_batch;
|
|
shard->central->hooks.curtime(&shard->last_purge,
|
|
/* first_reading */ false);
|
|
for (size_t i = 0; i < batch.item_cnt; ++i) {
|
|
hpa_purge_finish_hp(tsdn, shard, &batch.items[i]);
|
|
}
|
|
}
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
shard->stats.npurge_passes++;
|
|
return batch.npurged_hp_total;
|
|
}
|
|
|
|
/* Returns whether or not we hugified anything. */
|
|
static bool
|
|
hpa_try_hugify(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
|
|
if (hpa_hugify_blocked_by_ndirty(tsdn, shard)) {
|
|
return false;
|
|
}
|
|
|
|
hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
|
|
if (to_hugify == NULL) {
|
|
return false;
|
|
}
|
|
assert(hpdata_hugify_allowed_get(to_hugify));
|
|
assert(!hpdata_changing_state_get(to_hugify));
|
|
|
|
/* Make sure that it's been hugifiable for long enough. */
|
|
nstime_t time_hugify_allowed = hpdata_time_hugify_allowed(to_hugify);
|
|
uint64_t millis = shard->central->hooks.ms_since(&time_hugify_allowed);
|
|
if (millis < shard->opts.hugify_delay_ms) {
|
|
return false;
|
|
}
|
|
|
|
/*
|
|
* Don't let anyone else purge or hugify this page while
|
|
* we're hugifying it (allocations and deallocations are
|
|
* OK).
|
|
*/
|
|
psset_update_begin(&shard->psset, to_hugify);
|
|
hpdata_mid_hugify_set(to_hugify, true);
|
|
hpdata_purge_allowed_set(to_hugify, false);
|
|
hpdata_disallow_hugify(to_hugify);
|
|
assert(hpdata_alloc_allowed_get(to_hugify));
|
|
psset_update_end(&shard->psset, to_hugify);
|
|
/*
|
|
* Without lazy hugification, user relies on eagerly setting HG bit, or
|
|
* leaving everything up to the kernel (ex: thp enabled=always). We
|
|
* will still pretend that call succeeds to keep our accounting close to
|
|
* what user believes is the truth on the target system, but we won't
|
|
* update nhugifies stat as system call is not being made.
|
|
*/
|
|
if (hpa_is_hugify_lazy(shard) || opt_experimental_hpa_enforce_hugify) {
|
|
malloc_mutex_unlock(tsdn, &shard->mtx);
|
|
bool err = shard->central->hooks.hugify(
|
|
hpdata_addr_get(to_hugify), HUGEPAGE,
|
|
shard->opts.hugify_sync);
|
|
malloc_mutex_lock(tsdn, &shard->mtx);
|
|
shard->stats.nhugifies++;
|
|
if (err) {
|
|
/*
|
|
* When asynchronous hugification is used
|
|
* (shard->opts.hugify_sync option is false), we are not
|
|
* expecting to get here, unless something went terrible
|
|
* wrong. Because underlying syscall is only setting
|
|
* kernel flag for memory range (actual hugification
|
|
* happens asynchronously and we are not getting any
|
|
* feedback about its outcome), we expect syscall to be
|
|
* successful all the time.
|
|
*/
|
|
shard->stats.nhugify_failures++;
|
|
}
|
|
}
|
|
|
|
psset_update_begin(&shard->psset, to_hugify);
|
|
hpdata_hugify(to_hugify);
|
|
hpdata_mid_hugify_set(to_hugify, false);
|
|
hpa_update_purge_hugify_eligibility(tsdn, shard, to_hugify);
|
|
psset_update_end(&shard->psset, to_hugify);
|
|
|
|
return true;
|
|
}
|
|
|
|
static bool
|
|
hpa_min_purge_interval_passed(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
uint64_t since_last_purge_ms = nstime_ms_between(
|
|
&shard->last_purge, &shard->last_time_work_attempted);
|
|
return since_last_purge_ms >= shard->opts.min_purge_interval_ms;
|
|
}
|
|
|
|
static inline void
|
|
hpa_update_time_work_attempted(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
shard->central->hooks.curtime(&shard->last_time_work_attempted,
|
|
/* first_reading */ false);
|
|
}
|
|
|
|
/*
|
|
* Execution of deferred work is forced if it's triggered by an explicit
|
|
* hpa_shard_do_deferred_work() call.
|
|
*/
|
|
static void
|
|
hpa_shard_maybe_do_deferred_work(
|
|
tsdn_t *tsdn, hpa_shard_t *shard, bool forced) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
if (!forced && shard->opts.deferral_allowed) {
|
|
return;
|
|
}
|
|
hpa_update_time_work_attempted(tsdn, shard);
|
|
|
|
/*
|
|
* If we're on a background thread, do work so long as there's work to
|
|
* be done. Otherwise, bound latency to not be *too* bad by doing at
|
|
* most a small fixed number of operations.
|
|
*/
|
|
size_t max_ops = (forced ? (size_t)-1 : 16);
|
|
size_t nops = 0;
|
|
|
|
/*
|
|
* Always purge before hugifying, to make sure we get some
|
|
* ability to hit our quiescence targets.
|
|
*/
|
|
|
|
/*
|
|
* Make sure we respect purge interval setting and don't purge
|
|
* too frequently.
|
|
*/
|
|
if (hpa_min_purge_interval_passed(tsdn, shard)) {
|
|
size_t max_purges = max_ops;
|
|
/*
|
|
* Limit number of hugepages (slabs) to purge.
|
|
* When experimental_max_purge_nhp option is used, there is no
|
|
* guarantee we'll always respect dirty_mult option. Option
|
|
* experimental_max_purge_nhp provides a way to configure same
|
|
* behavior as was possible before, with buggy implementation
|
|
* of purging algorithm.
|
|
*/
|
|
ssize_t max_purge_nhp = shard->opts.experimental_max_purge_nhp;
|
|
if (max_purge_nhp != -1 && max_purges > (size_t)max_purge_nhp) {
|
|
max_purges = max_purge_nhp;
|
|
}
|
|
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
nops += hpa_purge(tsdn, shard, max_purges);
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
}
|
|
|
|
/*
|
|
* Try to hugify at least once, even if we out of operations to make at
|
|
* least some progress on hugification even at worst case.
|
|
*/
|
|
while (hpa_try_hugify(tsdn, shard) && nops < max_ops) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
nops++;
|
|
}
|
|
}
|
|
|
|
static edata_t *
|
|
hpa_try_alloc_one_offset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
|
|
hpdata_t *ps, hpdata_alloc_offset_t *alloc_offset, bool *oom) {
|
|
assert(*oom == false);
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
|
|
edata_t *edata = edata_cache_fast_get(tsdn, &shard->ecf);
|
|
if (edata == NULL) {
|
|
*oom = true;
|
|
return NULL;
|
|
}
|
|
|
|
void *addr = hpdata_reserve_alloc_offset(ps, size, alloc_offset);
|
|
JE_USDT(hpa_alloc, 5, shard->ind, addr, size, hpdata_nactive_get(ps),
|
|
hpdata_age_get(ps));
|
|
edata_init(edata, shard->ind, addr, size, /* slab */ false, SC_NSIZES,
|
|
/* sn */ hpdata_age_get(ps), extent_state_active,
|
|
/* zeroed */ false, /* committed */ true, EXTENT_PAI_HPA,
|
|
EXTENT_NOT_HEAD);
|
|
edata_ps_set(edata, ps);
|
|
|
|
/*
|
|
* This could theoretically be moved outside of the critical section,
|
|
* but that introduces the potential for a race. Without the lock, the
|
|
* (initially nonempty, since this is the reuse pathway) pageslab we
|
|
* allocated out of could become otherwise empty while the lock is
|
|
* dropped. This would force us to deal with a pageslab eviction down
|
|
* the error pathway, which is a pain.
|
|
*/
|
|
const bool err = emap_register_boundary(
|
|
tsdn, shard->emap, edata, SC_NSIZES, /* slab */ false);
|
|
if (err) {
|
|
hpdata_unreserve(
|
|
ps, edata_addr_get(edata), edata_size_get(edata));
|
|
JE_USDT(hpa_dalloc_err, 5, shard->ind, edata_addr_get(edata),
|
|
edata_size_get(edata), hpdata_nactive_get(ps),
|
|
hpdata_age_get(ps));
|
|
/*
|
|
* We should arguably reset dirty state here, but this would
|
|
* require some sort of prepare + commit functionality that's a
|
|
* little much to deal with for now.
|
|
*
|
|
* We don't have a do_deferred_work down this pathway, on the
|
|
* principle that we didn't *really* affect shard state (we
|
|
* tweaked the stats, but our tweaks weren't really accurate).
|
|
*/
|
|
edata_cache_fast_put(tsdn, &shard->ecf, edata);
|
|
*oom = true;
|
|
return NULL;
|
|
}
|
|
|
|
return edata;
|
|
}
|
|
|
|
static size_t
|
|
hpa_try_alloc_from_one_ps(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
|
|
size_t max_nallocs, bool *oom, edata_list_active_t *results,
|
|
bool *deferred_work_generated) {
|
|
assert(size <= HUGEPAGE);
|
|
assert(size <= shard->opts.slab_max_alloc || size == sz_s2u(size));
|
|
assert(*oom == false);
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
|
|
nstime_t start;
|
|
nstime_init_update(&start);
|
|
|
|
hpdata_t *ps = psset_pick_alloc(&shard->psset, size);
|
|
if (ps == NULL) {
|
|
return 0;
|
|
}
|
|
|
|
assert(max_nallocs <= SEC_MAX_NALLOCS);
|
|
hpdata_alloc_offset_t alloc_offsets[SEC_MAX_NALLOCS];
|
|
const size_t nallocs = hpdata_find_alloc_offsets(
|
|
ps, size, alloc_offsets, max_nallocs);
|
|
|
|
psset_update_begin(&shard->psset, ps);
|
|
|
|
if (hpdata_empty(ps)) {
|
|
/*
|
|
* If the pageslab used to be empty, treat it as though it's
|
|
* brand new for fragmentation-avoidance purposes; what we're
|
|
* trying to approximate is the age of the allocations *in* that
|
|
* pageslab, and the allocations in the new pageslab are by
|
|
* definition the youngest in this hpa shard.
|
|
*/
|
|
hpdata_age_set(ps, shard->age_counter++);
|
|
}
|
|
|
|
size_t nsuccess = 0;
|
|
for (; nsuccess < nallocs; nsuccess += 1) {
|
|
edata_t *edata = hpa_try_alloc_one_offset(
|
|
tsdn, shard, size, ps, (alloc_offsets + nsuccess), oom);
|
|
if (edata == NULL) {
|
|
break;
|
|
}
|
|
|
|
edata_list_active_append(results, edata);
|
|
}
|
|
|
|
hpdata_post_reserve_alloc_offsets(ps, size, alloc_offsets, nsuccess);
|
|
hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
|
|
psset_update_end(&shard->psset, ps);
|
|
|
|
const uint64_t elapsed_ns = nstime_ns_since(&start);
|
|
assert(nsuccess <= SEC_MAX_NALLOCS);
|
|
shard->stats.hpa_alloc_pages_per_ps[nsuccess] += nsuccess
|
|
* (size >> LG_PAGE);
|
|
shard->stats.hpa_alloc_extents_per_ps[nsuccess] += 1;
|
|
shard->stats.hpa_alloc_total_elapsed_ns_per_ps[nsuccess] += elapsed_ns;
|
|
|
|
return nsuccess;
|
|
}
|
|
|
|
static size_t
|
|
hpa_try_alloc_batch_no_grow_locked(tsdn_t *tsdn, hpa_shard_t *shard,
|
|
size_t size, size_t min_nallocs, size_t max_nallocs,
|
|
bool update_min_max_stats, bool *oom, edata_list_active_t *results,
|
|
bool *deferred_work_generated) {
|
|
assert(*oom == false);
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
|
|
/*
|
|
* As we require the shard mtx lock to update the stats,
|
|
* we do the update the first time this function is called from
|
|
* hpa_alloc_batch_psset().
|
|
*/
|
|
if (update_min_max_stats) {
|
|
assert(min_nallocs <= SEC_MAX_NALLOCS);
|
|
shard->stats.hpa_alloc_min_extents[min_nallocs] += 1;
|
|
assert(max_nallocs <= SEC_MAX_NALLOCS);
|
|
shard->stats.hpa_alloc_max_extents[max_nallocs] += 1;
|
|
}
|
|
|
|
size_t nsuccess = 0;
|
|
size_t ps_count = 0;
|
|
while (true) {
|
|
assert(1 <= min_nallocs);
|
|
assert(nsuccess < min_nallocs);
|
|
assert(min_nallocs <= max_nallocs);
|
|
const size_t nallocs = hpa_try_alloc_from_one_ps(tsdn, shard,
|
|
size, max_nallocs - nsuccess, oom, results,
|
|
deferred_work_generated);
|
|
if (nallocs == 0 || *oom) {
|
|
break;
|
|
}
|
|
nsuccess += nallocs;
|
|
ps_count += 1;
|
|
if (min_nallocs <= nsuccess) {
|
|
break;
|
|
}
|
|
}
|
|
|
|
assert(nsuccess <= SEC_MAX_NALLOCS);
|
|
shard->stats.hpa_alloc_extents[nsuccess] += 1;
|
|
assert(ps_count <= SEC_MAX_NALLOCS);
|
|
shard->stats.hpa_alloc_ps[ps_count] += 1;
|
|
|
|
hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
|
|
*deferred_work_generated = hpa_shard_has_deferred_work(tsdn, shard);
|
|
return nsuccess;
|
|
}
|
|
|
|
static size_t
|
|
hpa_try_alloc_batch_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
|
|
size_t min_nallocs, size_t max_nallocs, bool update_min_max_stats,
|
|
bool *oom, edata_list_active_t *results, bool *deferred_work_generated) {
|
|
malloc_mutex_lock(tsdn, &shard->mtx);
|
|
const size_t nsuccess = hpa_try_alloc_batch_no_grow_locked(tsdn, shard,
|
|
size, min_nallocs, max_nallocs, update_min_max_stats, oom, results,
|
|
deferred_work_generated);
|
|
malloc_mutex_unlock(tsdn, &shard->mtx);
|
|
return nsuccess;
|
|
}
|
|
|
|
static size_t
|
|
hpa_alloc_batch_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
|
|
size_t min_nallocs, size_t max_nallocs, edata_list_active_t *results,
|
|
bool *deferred_work_generated) {
|
|
bool oom = false;
|
|
|
|
size_t nsuccess = hpa_try_alloc_batch_no_grow(tsdn, shard, size,
|
|
min_nallocs, max_nallocs, /* update_min_max_stats */ true, &oom,
|
|
results, deferred_work_generated);
|
|
if (min_nallocs <= nsuccess || oom) {
|
|
return nsuccess;
|
|
}
|
|
|
|
/*
|
|
* We didn't OOM, but weren't able to fill everything requested of us;
|
|
* try to grow.
|
|
*/
|
|
malloc_mutex_lock(tsdn, &shard->grow_mtx);
|
|
|
|
/*
|
|
* Check for grow races; maybe some earlier thread expanded the psset
|
|
* in between when we dropped the main mutex and grabbed the grow mutex.
|
|
*/
|
|
assert(nsuccess < min_nallocs);
|
|
assert(min_nallocs <= max_nallocs);
|
|
nsuccess += hpa_try_alloc_batch_no_grow(tsdn, shard, size,
|
|
min_nallocs - nsuccess, max_nallocs - nsuccess,
|
|
/* update_min_max_stats */ false, &oom, results,
|
|
deferred_work_generated);
|
|
if (min_nallocs <= nsuccess || oom) {
|
|
malloc_mutex_unlock(tsdn, &shard->grow_mtx);
|
|
return nsuccess;
|
|
}
|
|
|
|
/*
|
|
* Note that we don't hold shard->mtx here (while growing);
|
|
* deallocations (and allocations of smaller sizes) may still succeed
|
|
* while we're doing this potentially expensive system call.
|
|
*/
|
|
hpdata_t *ps = hpa_central_extract(tsdn, shard->central, size,
|
|
shard->age_counter++, hpa_is_hugify_eager(shard), &oom);
|
|
if (ps == NULL) {
|
|
malloc_mutex_unlock(tsdn, &shard->grow_mtx);
|
|
return nsuccess;
|
|
}
|
|
|
|
/*
|
|
* We got the pageslab; allocate from it. This holds the grow mutex
|
|
* while doing deferred work, but this is an uncommon path; the
|
|
* simplicity is worth it.
|
|
*/
|
|
malloc_mutex_lock(tsdn, &shard->mtx);
|
|
psset_insert(&shard->psset, ps);
|
|
assert(nsuccess < min_nallocs);
|
|
assert(min_nallocs <= max_nallocs);
|
|
nsuccess += hpa_try_alloc_batch_no_grow_locked(tsdn, shard, size,
|
|
min_nallocs - nsuccess, max_nallocs - nsuccess,
|
|
/* update_min_max_stats */ false, &oom, results,
|
|
deferred_work_generated);
|
|
malloc_mutex_unlock(tsdn, &shard->mtx);
|
|
|
|
malloc_mutex_unlock(tsdn, &shard->grow_mtx);
|
|
|
|
return nsuccess;
|
|
}
|
|
|
|
static void
|
|
hpa_assert_results(
|
|
tsdn_t *tsdn, hpa_shard_t *shard, edata_list_active_t *results) {
|
|
/*
|
|
* Guard the sanity checks with config_debug because the loop cannot be
|
|
* proven non-circular by the compiler, even if everything within the
|
|
* loop is optimized away.
|
|
*/
|
|
if (config_debug) {
|
|
edata_t *edata;
|
|
ql_foreach (edata, &results->head, ql_link_active) {
|
|
emap_assert_mapped(tsdn, shard->emap, edata);
|
|
assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
|
|
assert(edata_state_get(edata) == extent_state_active);
|
|
assert(edata_arena_ind_get(edata) == shard->ind);
|
|
assert(
|
|
edata_szind_get_maybe_invalid(edata) == SC_NSIZES);
|
|
assert(!edata_slab_get(edata));
|
|
assert(edata_committed_get(edata));
|
|
assert(edata_base_get(edata) == edata_addr_get(edata));
|
|
assert(edata_base_get(edata) != NULL);
|
|
}
|
|
}
|
|
}
|
|
|
|
edata_t *
|
|
hpa_alloc(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, size_t alignment,
|
|
bool zero, bool guarded, bool frequent_reuse,
|
|
bool *deferred_work_generated) {
|
|
assert((size & PAGE_MASK) == 0);
|
|
assert(!guarded);
|
|
witness_assert_depth_to_rank(
|
|
tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
|
|
|
|
/* We don't handle alignment or zeroing for now. */
|
|
if (alignment > PAGE || zero) {
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* frequent_use here indicates this request comes from the arena bins,
|
|
* in which case it will be split into slabs, and therefore there is no
|
|
* intrinsic slack in the allocation (the entire range of allocated size
|
|
* will be accessed).
|
|
*
|
|
* In this case bypass the slab_max_alloc limit (if still within the
|
|
* huge page size). These requests do not concern internal
|
|
* fragmentation with huge pages (again, the full size will be used).
|
|
*/
|
|
if (!(frequent_reuse && size <= HUGEPAGE)
|
|
&& (size > shard->opts.slab_max_alloc)) {
|
|
return NULL;
|
|
}
|
|
edata_t *edata = sec_alloc(tsdn, &shard->sec, size);
|
|
if (edata != NULL) {
|
|
return edata;
|
|
}
|
|
edata_list_active_t results;
|
|
edata_list_active_init(&results);
|
|
size_t min_nallocs, max_nallocs;
|
|
sec_calc_nallocs_for_size(
|
|
&shard->sec, size, &min_nallocs, &max_nallocs);
|
|
size_t nsuccess = hpa_alloc_batch_psset(tsdn, shard, size, min_nallocs,
|
|
max_nallocs, &results, deferred_work_generated);
|
|
hpa_assert_results(tsdn, shard, &results);
|
|
edata = edata_list_active_first(&results);
|
|
|
|
if (edata != NULL) {
|
|
edata_list_active_remove(&results, edata);
|
|
assert(nsuccess > 0);
|
|
nsuccess--;
|
|
}
|
|
if (nsuccess > 0) {
|
|
assert(sec_size_supported(&shard->sec, size));
|
|
sec_fill(tsdn, &shard->sec, size, &results, nsuccess);
|
|
/* Unlikely rollback in case of overfill */
|
|
if (!edata_list_active_empty(&results)) {
|
|
hpa_dalloc_batch(
|
|
tsdn, shard, &results, deferred_work_generated);
|
|
}
|
|
}
|
|
witness_assert_depth_to_rank(
|
|
tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
|
|
return edata;
|
|
}
|
|
|
|
bool
|
|
hpa_expand(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata, size_t old_size,
|
|
size_t new_size, bool zero, bool *deferred_work_generated) {
|
|
/* Expand not yet supported. */
|
|
return true;
|
|
}
|
|
|
|
bool
|
|
hpa_shrink(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata, size_t old_size,
|
|
size_t new_size, bool *deferred_work_generated) {
|
|
/* Shrink not yet supported. */
|
|
return true;
|
|
}
|
|
|
|
static void
|
|
hpa_dalloc_prepare_unlocked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
|
|
malloc_mutex_assert_not_owner(tsdn, &shard->mtx);
|
|
|
|
assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
|
|
assert(edata_state_get(edata) == extent_state_active);
|
|
assert(edata_arena_ind_get(edata) == shard->ind);
|
|
assert(edata_szind_get_maybe_invalid(edata) == SC_NSIZES);
|
|
assert(edata_committed_get(edata));
|
|
assert(edata_base_get(edata) != NULL);
|
|
|
|
/*
|
|
* Another thread shouldn't be trying to touch the metadata of an
|
|
* allocation being freed. The one exception is a merge attempt from a
|
|
* lower-addressed PAC extent; in this case we have a nominal race on
|
|
* the edata metadata bits, but in practice the fact that the PAI bits
|
|
* are different will prevent any further access. The race is bad, but
|
|
* benign in practice, and the long term plan is to track enough state
|
|
* in the rtree to prevent these merge attempts in the first place.
|
|
*/
|
|
edata_addr_set(edata, edata_base_get(edata));
|
|
edata_zeroed_set(edata, false);
|
|
emap_deregister_boundary(tsdn, shard->emap, edata);
|
|
}
|
|
|
|
static void
|
|
hpa_dalloc_locked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
|
|
/*
|
|
* Release the metadata early, to avoid having to remember to do it
|
|
* while we're also doing tricky purging logic. First, we need to grab
|
|
* a few bits of metadata from it.
|
|
*
|
|
* Note that the shard mutex protects ps's metadata too; it wouldn't be
|
|
* correct to try to read most information out of it without the lock.
|
|
*/
|
|
hpdata_t *ps = edata_ps_get(edata);
|
|
/* Currently, all edatas come from pageslabs. */
|
|
assert(ps != NULL);
|
|
void *unreserve_addr = edata_addr_get(edata);
|
|
size_t unreserve_size = edata_size_get(edata);
|
|
edata_cache_fast_put(tsdn, &shard->ecf, edata);
|
|
|
|
psset_update_begin(&shard->psset, ps);
|
|
hpdata_unreserve(ps, unreserve_addr, unreserve_size);
|
|
JE_USDT(hpa_dalloc, 5, shard->ind, unreserve_addr, unreserve_size,
|
|
hpdata_nactive_get(ps), hpdata_age_get(ps));
|
|
hpa_update_purge_hugify_eligibility(tsdn, shard, ps);
|
|
psset_update_end(&shard->psset, ps);
|
|
}
|
|
|
|
static void
|
|
hpa_dalloc_batch(tsdn_t *tsdn, hpa_shard_t *shard, edata_list_active_t *list,
|
|
bool *deferred_work_generated) {
|
|
edata_t *edata;
|
|
ql_foreach (edata, &list->head, ql_link_active) {
|
|
hpa_dalloc_prepare_unlocked(tsdn, shard, edata);
|
|
}
|
|
|
|
malloc_mutex_lock(tsdn, &shard->mtx);
|
|
/* Now, remove from the list. */
|
|
while ((edata = edata_list_active_first(list)) != NULL) {
|
|
edata_list_active_remove(list, edata);
|
|
hpa_dalloc_locked(tsdn, shard, edata);
|
|
}
|
|
hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ false);
|
|
*deferred_work_generated = hpa_shard_has_deferred_work(tsdn, shard);
|
|
|
|
malloc_mutex_unlock(tsdn, &shard->mtx);
|
|
}
|
|
|
|
void
|
|
hpa_dalloc(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata,
|
|
bool *deferred_work_generated) {
|
|
assert(!edata_guarded_get(edata));
|
|
|
|
edata_list_active_t dalloc_list;
|
|
edata_list_active_init(&dalloc_list);
|
|
edata_list_active_append(&dalloc_list, edata);
|
|
|
|
sec_dalloc(tsdn, &shard->sec, &dalloc_list);
|
|
if (edata_list_active_empty(&dalloc_list)) {
|
|
/* sec consumed the pointer */
|
|
*deferred_work_generated = false;
|
|
return;
|
|
}
|
|
/* We may have more than one pointer to flush now */
|
|
hpa_dalloc_batch(tsdn, shard, &dalloc_list, deferred_work_generated);
|
|
}
|
|
|
|
/*
|
|
* Calculate time until either purging or hugification ought to happen.
|
|
* Called by background threads.
|
|
*/
|
|
uint64_t
|
|
hpa_time_until_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
uint64_t time_ns = BACKGROUND_THREAD_DEFERRED_MAX;
|
|
|
|
malloc_mutex_lock(tsdn, &shard->mtx);
|
|
|
|
hpdata_t *to_hugify = psset_pick_hugify(&shard->psset);
|
|
if (to_hugify != NULL) {
|
|
nstime_t time_hugify_allowed = hpdata_time_hugify_allowed(
|
|
to_hugify);
|
|
uint64_t since_hugify_allowed_ms =
|
|
shard->central->hooks.ms_since(&time_hugify_allowed);
|
|
/*
|
|
* If not enough time has passed since hugification was allowed,
|
|
* sleep for the rest.
|
|
*/
|
|
if (since_hugify_allowed_ms < shard->opts.hugify_delay_ms) {
|
|
time_ns = shard->opts.hugify_delay_ms
|
|
- since_hugify_allowed_ms;
|
|
time_ns *= 1000 * 1000;
|
|
} else {
|
|
malloc_mutex_unlock(tsdn, &shard->mtx);
|
|
return BACKGROUND_THREAD_DEFERRED_MIN;
|
|
}
|
|
}
|
|
|
|
if (hpa_should_purge(tsdn, shard)) {
|
|
/*
|
|
* If we haven't purged before, no need to check interval
|
|
* between purges. Simply purge as soon as possible.
|
|
*/
|
|
if (shard->stats.npurge_passes == 0) {
|
|
malloc_mutex_unlock(tsdn, &shard->mtx);
|
|
return BACKGROUND_THREAD_DEFERRED_MIN;
|
|
}
|
|
uint64_t since_last_purge_ms = shard->central->hooks.ms_since(
|
|
&shard->last_purge);
|
|
|
|
if (since_last_purge_ms < shard->opts.min_purge_interval_ms) {
|
|
uint64_t until_purge_ns;
|
|
until_purge_ns = shard->opts.min_purge_interval_ms
|
|
- since_last_purge_ms;
|
|
until_purge_ns *= 1000 * 1000;
|
|
|
|
if (until_purge_ns < time_ns) {
|
|
time_ns = until_purge_ns;
|
|
}
|
|
} else {
|
|
time_ns = BACKGROUND_THREAD_DEFERRED_MIN;
|
|
}
|
|
}
|
|
malloc_mutex_unlock(tsdn, &shard->mtx);
|
|
return time_ns;
|
|
}
|
|
|
|
static void
|
|
hpa_sec_flush_impl(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
edata_list_active_t to_flush;
|
|
edata_list_active_init(&to_flush);
|
|
|
|
sec_flush(tsdn, &shard->sec, &to_flush);
|
|
bool deferred_work_generated;
|
|
hpa_dalloc_batch(tsdn, shard, &to_flush, &deferred_work_generated);
|
|
}
|
|
|
|
void
|
|
hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
hpa_do_consistency_checks(shard);
|
|
hpa_sec_flush_impl(tsdn, shard);
|
|
|
|
malloc_mutex_lock(tsdn, &shard->mtx);
|
|
edata_cache_fast_disable(tsdn, &shard->ecf);
|
|
malloc_mutex_unlock(tsdn, &shard->mtx);
|
|
}
|
|
|
|
void
|
|
hpa_shard_flush(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
hpa_sec_flush_impl(tsdn, shard);
|
|
}
|
|
|
|
static void
|
|
hpa_shard_assert_stats_empty(psset_bin_stats_t *bin_stats) {
|
|
assert(bin_stats->npageslabs == 0);
|
|
assert(bin_stats->nactive == 0);
|
|
}
|
|
|
|
static void
|
|
hpa_assert_empty(tsdn_t *tsdn, hpa_shard_t *shard, psset_t *psset) {
|
|
malloc_mutex_assert_owner(tsdn, &shard->mtx);
|
|
for (int huge = 0; huge <= 1; huge++) {
|
|
hpa_shard_assert_stats_empty(&psset->stats.full_slabs[huge]);
|
|
for (pszind_t i = 0; i < PSSET_NPSIZES; i++) {
|
|
hpa_shard_assert_stats_empty(
|
|
&psset->stats.nonfull_slabs[i][huge]);
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
hpa_do_consistency_checks(shard);
|
|
hpa_shard_flush(tsdn, shard);
|
|
/*
|
|
* By the time we're here, the arena code should have dalloc'd all the
|
|
* active extents, which means we should have eventually evicted
|
|
* everything from the psset, so it shouldn't be able to serve even a
|
|
* 1-page allocation.
|
|
*/
|
|
if (config_debug) {
|
|
malloc_mutex_lock(tsdn, &shard->mtx);
|
|
hpa_assert_empty(tsdn, shard, &shard->psset);
|
|
malloc_mutex_unlock(tsdn, &shard->mtx);
|
|
}
|
|
hpdata_t *ps;
|
|
while ((ps = psset_pick_alloc(&shard->psset, PAGE)) != NULL) {
|
|
/* There should be no allocations anywhere. */
|
|
assert(hpdata_empty(ps));
|
|
psset_remove(&shard->psset, ps);
|
|
shard->central->hooks.unmap(hpdata_addr_get(ps), HUGEPAGE);
|
|
}
|
|
}
|
|
|
|
void
|
|
hpa_shard_set_deferral_allowed(
|
|
tsdn_t *tsdn, hpa_shard_t *shard, bool deferral_allowed) {
|
|
hpa_do_consistency_checks(shard);
|
|
|
|
malloc_mutex_lock(tsdn, &shard->mtx);
|
|
bool deferral_previously_allowed = shard->opts.deferral_allowed;
|
|
shard->opts.deferral_allowed = deferral_allowed;
|
|
if (deferral_previously_allowed && !deferral_allowed) {
|
|
hpa_shard_maybe_do_deferred_work(tsdn, shard,
|
|
/* forced */ true);
|
|
}
|
|
malloc_mutex_unlock(tsdn, &shard->mtx);
|
|
}
|
|
|
|
void
|
|
hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
hpa_do_consistency_checks(shard);
|
|
|
|
malloc_mutex_lock(tsdn, &shard->mtx);
|
|
hpa_shard_maybe_do_deferred_work(tsdn, shard, /* forced */ true);
|
|
malloc_mutex_unlock(tsdn, &shard->mtx);
|
|
}
|
|
|
|
void
|
|
hpa_shard_prefork2(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
hpa_do_consistency_checks(shard);
|
|
sec_prefork2(tsdn, &shard->sec);
|
|
}
|
|
|
|
void
|
|
hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
hpa_do_consistency_checks(shard);
|
|
|
|
malloc_mutex_prefork(tsdn, &shard->grow_mtx);
|
|
}
|
|
|
|
void
|
|
hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
hpa_do_consistency_checks(shard);
|
|
|
|
malloc_mutex_prefork(tsdn, &shard->mtx);
|
|
}
|
|
|
|
void
|
|
hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
hpa_do_consistency_checks(shard);
|
|
|
|
sec_postfork_parent(tsdn, &shard->sec);
|
|
malloc_mutex_postfork_parent(tsdn, &shard->grow_mtx);
|
|
malloc_mutex_postfork_parent(tsdn, &shard->mtx);
|
|
}
|
|
|
|
void
|
|
hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) {
|
|
hpa_do_consistency_checks(shard);
|
|
|
|
sec_postfork_child(tsdn, &shard->sec);
|
|
malloc_mutex_postfork_child(tsdn, &shard->grow_mtx);
|
|
malloc_mutex_postfork_child(tsdn, &shard->mtx);
|
|
}
|