Introduce pinned extents to contain unpurgeable pages

Some pages (e.g., hugetlb pages) cannot be purged, and should be
prioritized for reuse.  A custom extent_alloc hook signals this by
OR'ing EXTENT_ALLOC_FLAG_PINNED into the low bits of the returned
pointer; jemalloc strips the flag bits and caches pinned extents in
a dedicated ecache_pinned, separate from the dirty/muzzy decay
pipeline.

Pinned extents are not coalesced eagerly, except for those at or above
SC_LARGE_MINCLASS.  A prefer-small policy reuses the smallest fitting
pinned extent, avoiding unnecessary splitting and fragmentation.
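
As an illustration of the hook-side contract described above, here is a
minimal sketch of a pinned-aware extent_alloc hook.  It assumes the public
extent_hooks_t alloc signature is unchanged, that EXTENT_ALLOC_FLAG_PINNED
occupies an otherwise-unused low pointer bit (page alignment keeps the low
bits clear), and that hugetlb-backed mmap is the pinned source; the flag
value, alignment handling, and error handling here are illustrative only.

#include <stdbool.h>
#include <stdint.h>
#include <sys/mman.h>
#include <jemalloc/jemalloc.h>

/* Hypothetical value; the real definition comes from this patch's headers. */
#define EXTENT_ALLOC_FLAG_PINNED ((uintptr_t)0x1)

void *
pinned_extent_alloc(extent_hooks_t *hooks, void *new_addr, size_t size,
    size_t alignment, bool *zero, bool *commit, unsigned arena_ind) {
	(void)hooks; (void)alignment; (void)arena_ind;
	if (new_addr != NULL) {
		return NULL;	/* Placement requests are not supported here. */
	}
	/*
	 * Assumes size is a hugepage multiple; hugetlb mappings are
	 * hugepage-aligned, which covers typical alignment requests.
	 */
	void *addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (addr == MAP_FAILED) {
		return NULL;
	}
	*zero = true;	/* Fresh anonymous mappings are zero-filled. */
	*commit = true;	/* hugetlb memory is committed up front. */
	/* Tag the result so jemalloc routes it to ecache_pinned. */
	return (void *)((uintptr_t)addr | EXTENT_ALLOC_FLAG_PINNED);
}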
Bin Liu 2026-04-19 22:56:22 -07:00 committed by Guangli Dai
parent 7638093c73
commit be2de8ccd8
22 changed files with 977 additions and 86 deletions

View file

@ -797,6 +797,8 @@ arena_prepare_base_deletion(tsd_t *tsd, base_t *base_to_destroy) {
tsd, &pac->ecache_muzzy.mtx, delayed_mtx, &n_delayed);
arena_prepare_base_deletion_sync(
tsd, &pac->ecache_retained.mtx, delayed_mtx, &n_delayed);
arena_prepare_base_deletion_sync(
tsd, &pac->ecache_pinned.mtx, delayed_mtx, &n_delayed);
}
arena_prepare_base_deletion_sync_finish(tsd, delayed_mtx, n_delayed);
}

View file

@ -52,8 +52,9 @@ base_map(tsdn_t *tsdn, ehooks_t *ehooks, unsigned ind, size_t size) {
if (ehooks_are_default(ehooks)) {
addr = extent_alloc_mmap(NULL, size, alignment, &zero, &commit);
} else {
addr = ehooks_alloc(
tsdn, ehooks, NULL, size, alignment, &zero, &commit);
UNUSED unsigned flags;
addr = ehooks_alloc(tsdn, ehooks, NULL, size, alignment, &zero,
&commit, &flags);
}
return addr;

View file

@ -257,9 +257,11 @@ INDEX_PROTO(stats_arenas_i_lextents_j)
CTL_PROTO(stats_arenas_i_extents_j_ndirty)
CTL_PROTO(stats_arenas_i_extents_j_nmuzzy)
CTL_PROTO(stats_arenas_i_extents_j_nretained)
CTL_PROTO(stats_arenas_i_extents_j_npinned)
CTL_PROTO(stats_arenas_i_extents_j_dirty_bytes)
CTL_PROTO(stats_arenas_i_extents_j_muzzy_bytes)
CTL_PROTO(stats_arenas_i_extents_j_retained_bytes)
CTL_PROTO(stats_arenas_i_extents_j_pinned_bytes)
INDEX_PROTO(stats_arenas_i_extents_j)
/* Merged set of stats for HPA shard. */
@ -320,6 +322,7 @@ CTL_PROTO(stats_arenas_i_pdirty)
CTL_PROTO(stats_arenas_i_pmuzzy)
CTL_PROTO(stats_arenas_i_mapped)
CTL_PROTO(stats_arenas_i_retained)
CTL_PROTO(stats_arenas_i_pinned)
CTL_PROTO(stats_arenas_i_extent_avail)
CTL_PROTO(stats_arenas_i_dirty_npurge)
CTL_PROTO(stats_arenas_i_dirty_nmadvise)
@ -355,6 +358,7 @@ CTL_PROTO(stats_metadata_thp)
CTL_PROTO(stats_resident)
CTL_PROTO(stats_mapped)
CTL_PROTO(stats_retained)
CTL_PROTO(stats_pinned)
CTL_PROTO(stats_zero_reallocs)
CTL_PROTO(approximate_stats_active)
CTL_PROTO(experimental_hooks_install)
@ -697,9 +701,11 @@ static const ctl_named_node_t stats_arenas_i_extents_j_node[] = {
{NAME("ndirty"), CTL(stats_arenas_i_extents_j_ndirty)},
{NAME("nmuzzy"), CTL(stats_arenas_i_extents_j_nmuzzy)},
{NAME("nretained"), CTL(stats_arenas_i_extents_j_nretained)},
{NAME("npinned"), CTL(stats_arenas_i_extents_j_npinned)},
{NAME("dirty_bytes"), CTL(stats_arenas_i_extents_j_dirty_bytes)},
{NAME("muzzy_bytes"), CTL(stats_arenas_i_extents_j_muzzy_bytes)},
{NAME("retained_bytes"), CTL(stats_arenas_i_extents_j_retained_bytes)}};
{NAME("retained_bytes"), CTL(stats_arenas_i_extents_j_retained_bytes)},
{NAME("pinned_bytes"), CTL(stats_arenas_i_extents_j_pinned_bytes)}};
static const ctl_named_node_t super_stats_arenas_i_extents_j_node[] = {
{NAME(""), CHILD(named, stats_arenas_i_extents_j)}};
@ -807,6 +813,7 @@ static const ctl_named_node_t stats_arenas_i_node[] = {
{NAME("pmuzzy"), CTL(stats_arenas_i_pmuzzy)},
{NAME("mapped"), CTL(stats_arenas_i_mapped)},
{NAME("retained"), CTL(stats_arenas_i_retained)},
{NAME("pinned"), CTL(stats_arenas_i_pinned)},
{NAME("extent_avail"), CTL(stats_arenas_i_extent_avail)},
{NAME("dirty_npurge"), CTL(stats_arenas_i_dirty_npurge)},
{NAME("dirty_nmadvise"), CTL(stats_arenas_i_dirty_nmadvise)},
@ -872,6 +879,7 @@ static const ctl_named_node_t stats_node[] = {
{NAME("resident"), CTL(stats_resident)},
{NAME("mapped"), CTL(stats_mapped)},
{NAME("retained"), CTL(stats_retained)},
{NAME("pinned"), CTL(stats_pinned)},
{NAME("background_thread"), CHILD(named, stats_background_thread)},
{NAME("mutexes"), CHILD(named, stats_mutexes)},
{NAME("arenas"), CHILD(indexed, stats_arenas)},
@ -1111,6 +1119,8 @@ ctl_arena_stats_sdmerge(
sdstats->astats.mapped += astats->astats.mapped;
sdstats->astats.pa_shard_stats.pac_stats.retained +=
astats->astats.pa_shard_stats.pac_stats.retained;
sdstats->astats.pa_shard_stats.pac_stats.pinned +=
astats->astats.pa_shard_stats.pac_stats.pinned;
sdstats->astats.pa_shard_stats.edata_avail +=
astats->astats.pa_shard_stats.edata_avail;
}
@ -1247,12 +1257,16 @@ ctl_arena_stats_sdmerge(
sdstats->estats[i].nmuzzy += astats->estats[i].nmuzzy;
sdstats->estats[i].nretained +=
astats->estats[i].nretained;
sdstats->estats[i].npinned +=
astats->estats[i].npinned;
sdstats->estats[i].dirty_bytes +=
astats->estats[i].dirty_bytes;
sdstats->estats[i].muzzy_bytes +=
astats->estats[i].muzzy_bytes;
sdstats->estats[i].retained_bytes +=
astats->estats[i].retained_bytes;
sdstats->estats[i].pinned_bytes +=
astats->estats[i].pinned_bytes;
}
/* Merge HPA stats. */
@ -1367,6 +1381,8 @@ ctl_refresh(tsdn_t *tsdn) {
ctl_stats->mapped = ctl_sarena->astats->astats.mapped;
ctl_stats->retained = ctl_sarena->astats->astats.pa_shard_stats
.pac_stats.retained;
ctl_stats->pinned = ctl_sarena->astats->astats.pa_shard_stats
.pac_stats.pinned;
ctl_background_thread_stats_read(tsdn);
@ -3721,6 +3737,7 @@ CTL_RO_CGEN(config_stats, stats_metadata_thp, ctl_stats->metadata_thp, size_t)
CTL_RO_CGEN(config_stats, stats_resident, ctl_stats->resident, size_t)
CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats->mapped, size_t)
CTL_RO_CGEN(config_stats, stats_retained, ctl_stats->retained, size_t)
CTL_RO_CGEN(config_stats, stats_pinned, ctl_stats->pinned, size_t)
CTL_RO_CGEN(config_stats, stats_background_thread_num_threads,
ctl_stats->background_thread.num_threads, size_t)
@ -3786,6 +3803,8 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_mapped,
arenas_i(mib[2])->astats->astats.mapped, size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_retained,
arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.retained, size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_pinned,
arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.pinned, size_t)
CTL_RO_CGEN(config_stats, stats_arenas_i_extent_avail,
arenas_i(mib[2])->astats->astats.pa_shard_stats.edata_avail, size_t)
@ -3958,6 +3977,7 @@ stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen,
MUTEX_PROF_RESET(arena->pa_shard.pac.ecache_dirty.mtx);
MUTEX_PROF_RESET(arena->pa_shard.pac.ecache_muzzy.mtx);
MUTEX_PROF_RESET(arena->pa_shard.pac.ecache_retained.mtx);
MUTEX_PROF_RESET(arena->pa_shard.pac.ecache_pinned.mtx);
MUTEX_PROF_RESET(arena->pa_shard.pac.decay_dirty.mtx);
MUTEX_PROF_RESET(arena->pa_shard.pac.decay_muzzy.mtx);
MUTEX_PROF_RESET(arena->tcache_ql_mtx);
@ -4034,12 +4054,16 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_nmuzzy,
arenas_i(mib[2])->astats->estats[mib[4]].nmuzzy, size_t);
CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_nretained,
arenas_i(mib[2])->astats->estats[mib[4]].nretained, size_t);
CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_npinned,
arenas_i(mib[2])->astats->estats[mib[4]].npinned, size_t);
CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_dirty_bytes,
arenas_i(mib[2])->astats->estats[mib[4]].dirty_bytes, size_t);
CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_muzzy_bytes,
arenas_i(mib[2])->astats->estats[mib[4]].muzzy_bytes, size_t);
CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_retained_bytes,
arenas_i(mib[2])->astats->estats[mib[4]].retained_bytes, size_t);
CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_pinned_bytes,
arenas_i(mib[2])->astats->estats[mib[4]].pinned_bytes, size_t);
static const ctl_named_node_t *
stats_arenas_i_extents_j_index(

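The new counters are read through the standard mallctl interface like the
existing ones; a minimal sketch of reading the global totals (the stat names
match the CTL entries added above; error handling elided):

#include <stdio.h>
#include <stdint.h>
#include <jemalloc/jemalloc.h>

static void
print_pinned_stats(void) {
	/* Advance the epoch so the cached stats are refreshed. */
	uint64_t epoch = 1;
	size_t esz = sizeof(epoch);
	mallctl("epoch", &epoch, &esz, &epoch, esz);

	size_t pinned, retained, sz = sizeof(size_t);
	mallctl("stats.pinned", &pinned, &sz, NULL, 0);
	mallctl("stats.retained", &retained, &sz, NULL, 0);
	printf("pinned: %zu bytes, retained: %zu bytes\n", pinned, retained);
}
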
View file

@ -50,7 +50,8 @@ emap_try_acquire_edata_neighbor_impl(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
assert(!edata_state_in_transition(expected_state));
assert(expected_state == extent_state_dirty
|| expected_state == extent_state_muzzy
|| expected_state == extent_state_retained);
|| expected_state == extent_state_retained
|| expected_state == extent_state_pinned);
void *neighbor_addr = forward ? edata_past_get(edata)
: edata_before_get(edata);

View file

@ -98,7 +98,10 @@ eset_insert(eset_t *eset, edata_t *edata) {
eset_stats_add(eset, pind, size);
}
edata_list_inactive_append(&eset->lru, edata);
/* Pinned extents skip LRU as they do not decay. */
if (!edata_pinned_get(edata)) {
edata_list_inactive_append(&eset->lru, edata);
}
size_t npages = size >> LG_PAGE;
/*
* All modifications to npages hold the mutex (as asserted above), so we
@ -143,7 +146,9 @@ eset_remove(eset_t *eset, edata_t *edata) {
edata_heap_first(&eset->bins[pind].heap));
}
}
edata_list_inactive_remove(&eset->lru, edata);
if (!edata_pinned_get(edata)) {
edata_list_inactive_remove(&eset->lru, edata);
}
size_t npages = size >> LG_PAGE;
/*
* As in eset_insert, we hold eset->mtx and so don't need atomic
@ -279,10 +284,15 @@ eset_fit_alignment(
* avoiding reusing and splitting large extents for smaller sizes. In practice,
* it's set to opt_lg_extent_max_active_fit for the dirty eset and SC_PTR_BITS
* for others.
*
* If prefer_small is true, return as soon as the smallest fitting bin yields a
* candidate, instead of scanning further bins for an older/lower extent.
* Useful for limiting fragmentation in the pinned pool.
*/
static edata_t *
eset_first_fit(
eset_t *eset, size_t size, bool exact_only, unsigned lg_max_fit) {
eset_t *eset, size_t size, bool exact_only, unsigned lg_max_fit,
bool prefer_small) {
edata_t *ret = NULL;
edata_cmp_summary_t ret_summ JEMALLOC_CC_SILENCE_INIT({0});
@ -327,6 +337,9 @@ eset_first_fit(
if (sz_large_size_classes_disabled() && pind != pind_prev) {
ret = eset_enumerate_search(eset, size, pind_prev,
/* exact_only */ false, &ret_summ);
if (prefer_small && ret != NULL) {
return ret;
}
}
for (pszind_t i =
@ -363,6 +376,9 @@ eset_first_fit(
edata_cmp_summary_get(edata))
== 0);
ret = edata;
if (prefer_small) {
return ret;
}
ret_summ = eset->bins[i].heap_min;
}
if (i == SC_NPSIZES) {
@ -376,14 +392,15 @@ eset_first_fit(
edata_t *
eset_fit(eset_t *eset, size_t esize, size_t alignment, bool exact_only,
unsigned lg_max_fit) {
unsigned lg_max_fit, bool prefer_small) {
size_t max_size = esize + PAGE_CEILING(alignment) - PAGE;
/* Beware size_t wrap-around. */
if (max_size < esize) {
return NULL;
}
edata_t *edata = eset_first_fit(eset, max_size, exact_only, lg_max_fit);
edata_t *edata = eset_first_fit(eset, max_size, exact_only, lg_max_fit,
prefer_small);
if (alignment > PAGE && edata == NULL) {
/*

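To make the prefer_small policy concrete, here is a toy sketch with
illustrative names (not the eset internals above): bins are ordered by
ascending size class, bin_best() returns the preferred candidate within a
bin, and extent_pref_cmp() orders candidates across bins (older/lower
first).  With prefer_small, the scan returns from the first non-empty
fitting bin instead of continuing into higher bins, trading age/address
preference for less splitting.

#include <stdbool.h>
#include <stddef.h>

typedef struct extent_s extent_t;
extern extent_t *bin_best(int i);	/* Best candidate in bin i, or NULL. */
extern int extent_pref_cmp(const extent_t *a, const extent_t *b);

static extent_t *
toy_first_fit(int first_fitting_bin, int nbins, bool prefer_small) {
	extent_t *ret = NULL;
	for (int i = first_fitting_bin; i < nbins; i++) {
		extent_t *e = bin_best(i);
		if (e == NULL) {
			continue;
		}
		if (prefer_small) {
			/*
			 * Smallest fitting extent wins outright; no further
			 * scan, and no splitting of larger extents.
			 */
			return e;
		}
		if (ret == NULL || extent_pref_cmp(e, ret) < 0) {
			ret = e;	/* Keep scanning for an older/lower fit. */
		}
	}
	return ret;
}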
View file

@ -70,6 +70,7 @@ extent_may_force_decay(pac_t *pac) {
static bool
extent_try_delayed_coalesce(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
ecache_t *ecache, edata_t *edata) {
malloc_mutex_assert_owner(tsdn, &ecache->mtx);
emap_update_edata_state(tsdn, pac->emap, edata, extent_state_active);
bool coalesced;
@ -212,6 +213,7 @@ ecache_evict(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache,
switch (ecache->state) {
case extent_state_dirty:
case extent_state_muzzy:
case extent_state_pinned:
emap_update_edata_state(
tsdn, pac->emap, edata, extent_state_active);
break;
@ -244,7 +246,11 @@ extents_abandon_vm(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache,
}
/*
* Leak extent after making sure its pages have already been purged, so
* that this is only a virtual memory leak.
* that this is only a virtual memory leak, except when the extent is
* pinned/unpurgeable, in which case real memory is leaked. This is
* acceptable because reaching this path requires that an extent split
* fail, which is already an exceptional condition (typically an OOM
* on edata_t allocation).
*/
if (ecache->state == extent_state_dirty) {
if (extent_purge_lazy_impl(
@ -434,8 +440,12 @@ extent_recycle_extract(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
* then no longer satify a request for its original size. To
* limit this effect, when delayed coalescing is enabled, we
* put a cap on how big an extent we can split for a request.
*
* Pinned extents are exempt: they are never purged, so the cap
* doesn't apply.
*/
unsigned lg_max_fit = ecache->delay_coalesce
unsigned lg_max_fit = (ecache->delay_coalesce
&& ecache != &pac->ecache_pinned)
? (unsigned)opt_lg_extent_max_active_fit
: SC_PTR_BITS;
@ -448,7 +458,13 @@ extent_recycle_extract(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
* allocations.
*/
bool exact_only = (!maps_coalesce && !opt_retain) || guarded;
edata = eset_fit(eset, size, alignment, exact_only, lg_max_fit);
/*
* When selecting a pinned extent, avoid splitting a larger extent
* when a smaller one suffices.
*/
bool prefer_small = (ecache == &pac->ecache_pinned);
edata = eset_fit(eset, size, alignment, exact_only, lg_max_fit,
prefer_small);
}
if (edata == NULL) {
return NULL;
@ -733,10 +749,9 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size,
}
bool zeroed = false;
bool committed = false;
void *ptr = ehooks_alloc(
tsdn, ehooks, NULL, alloc_size, PAGE, &zeroed, &committed);
unsigned flags = 0;
void *ptr = ehooks_alloc(tsdn, ehooks, NULL, alloc_size, PAGE, &zeroed,
&committed, &flags);
if (ptr == NULL) {
edata_cache_put(tsdn, pac->edata_cache, edata);
goto label_err;
@ -746,6 +761,10 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size,
edata_init(edata, ind, ptr, alloc_size, false, SC_NSIZES,
extent_sn_next(pac), extent_state_active, zeroed, committed,
EXTENT_PAI_PAC, EXTENT_IS_HEAD);
edata_hook_flags_init(edata, flags);
if (flags & EXTENT_ALLOC_FLAG_PINNED) {
atomic_store_b(&pac->has_pinned, true, ATOMIC_RELAXED);
}
if (extent_register_no_gdump_add(tsdn, pac, edata)) {
edata_cache_put(tsdn, pac->edata_cache, edata);
@ -767,12 +786,10 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size,
if (result == extent_split_interior_ok) {
if (lead != NULL) {
extent_record(
tsdn, pac, ehooks, &pac->ecache_retained, lead);
pac_record_grown(tsdn, pac, ehooks, lead);
}
if (trail != NULL) {
extent_record(
tsdn, pac, ehooks, &pac->ecache_retained, trail);
pac_record_grown(tsdn, pac, ehooks, trail);
}
} else {
/*
@ -784,8 +801,7 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size,
if (config_prof) {
extent_gdump_add(tsdn, to_salvage);
}
extent_record(tsdn, pac, ehooks, &pac->ecache_retained,
to_salvage);
pac_record_grown(tsdn, pac, ehooks, to_salvage);
}
if (to_leak != NULL) {
extent_deregister_no_gdump_sub(tsdn, pac, to_leak);
@ -796,6 +812,8 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size,
}
if (*commit && !edata_committed_get(edata)) {
/* Pinned memory must be committed by the hook. */
assert(!edata_pinned_get(edata));
if (extent_commit_impl(
tsdn, ehooks, edata, 0, edata_size_get(edata), true)) {
extent_record(
@ -815,10 +833,13 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size,
/*
* Increment extent_grow_next if doing so wouldn't exceed the allowed
* range.
* range. Skip for pinned: pinned memory is a finite resource;
* oversized remnants waste it.
*/
/* All opportunities for failure are past. */
exp_grow_size_commit(&pac->exp_grow, exp_grow_skip);
if (!(flags & EXTENT_ALLOC_FLAG_PINNED)) {
exp_grow_size_commit(&pac->exp_grow, exp_grow_skip);
}
malloc_mutex_unlock(tsdn, &pac->grow_mtx);
if (huge_arena_pac_thp.thp_madvise) {
@ -1020,11 +1041,13 @@ extent_record(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache,
edata = extent_try_coalesce(
tsdn, pac, ehooks, ecache, edata, &coalesced_unused);
} else if (edata_size_get(edata) >= SC_LARGE_MINCLASS) {
assert(ecache == &pac->ecache_dirty);
assert(edata_pinned_get(edata)
? (ecache == &pac->ecache_pinned)
: (ecache == &pac->ecache_dirty));
/* Always coalesce large extents eagerly. */
/**
* Maximum size limit (max_size) for large extents waiting to be coalesced
* in dirty ecache.
* in pinned/dirty ecache.
*
* When set to a non-zero value, this parameter restricts the maximum size
* of large extents after coalescing. If the combined size of two extents
@ -1056,7 +1079,9 @@ extent_record(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache,
edata = extent_try_coalesce_large(tsdn, pac, ehooks,
ecache, edata, max_size, &coalesced);
} while (coalesced);
if (edata_size_get(edata) >= atomic_load_zu(
/* Pinned extents cannot be purged; skip the oversize shortcut. */
if (ecache == &pac->ecache_dirty
&& edata_size_get(edata) >= atomic_load_zu(
&pac->oversize_threshold, ATOMIC_RELAXED)
&& !background_thread_enabled()
&& extent_may_force_decay(pac)) {
@ -1119,8 +1144,9 @@ extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, void *new_addr,
return NULL;
}
size_t palignment = ALIGNMENT_CEILING(alignment, PAGE);
void *addr = ehooks_alloc(
tsdn, ehooks, new_addr, size, palignment, &zero, commit);
unsigned flags = 0;
void *addr = ehooks_alloc(tsdn, ehooks, new_addr, size, palignment,
&zero, commit, &flags);
if (addr == NULL) {
edata_cache_put(tsdn, pac->edata_cache, edata);
return NULL;
@ -1129,6 +1155,10 @@ extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, void *new_addr,
/* slab */ false, SC_NSIZES, extent_sn_next(pac),
extent_state_active, zero, *commit, EXTENT_PAI_PAC,
opt_retain ? EXTENT_IS_HEAD : EXTENT_NOT_HEAD);
edata_hook_flags_init(edata, flags);
if (flags & EXTENT_ALLOC_FLAG_PINNED) {
atomic_store_b(&pac->has_pinned, true, ATOMIC_RELAXED);
}
/*
* Retained memory is not counted towards gdump. Only if an extent is
* allocated as a separate mapping, i.e. growing_retained is false, then
@ -1328,6 +1358,7 @@ extent_split_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata,
/* slab */ false, SC_NSIZES, edata_sn_get(edata),
edata_state_get(edata), edata_zeroed_get(edata),
edata_committed_get(edata), EXTENT_PAI_PAC, EXTENT_NOT_HEAD);
edata_hook_flags_init(trail, edata_alloc_flags_get(edata));
emap_prepare_t prepare;
bool err = emap_split_prepare(
tsdn, pac->emap, &prepare, edata, size_a, trail, size_b);
@ -1412,6 +1443,8 @@ extent_merge_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *a,
: edata_sn_get(b));
edata_zeroed_set(a, edata_zeroed_get(a) && edata_zeroed_get(b));
assert(edata_pinned_get(a) == edata_pinned_get(b));
emap_merge_commit(tsdn, pac->emap, &prepare, a, b);
edata_cache_put(tsdn, pac->edata_cache, b);

View file

@ -34,6 +34,7 @@ pa_shard_prefork4(tsdn_t *tsdn, pa_shard_t *shard) {
ecache_prefork(tsdn, &shard->pac.ecache_dirty);
ecache_prefork(tsdn, &shard->pac.ecache_muzzy);
ecache_prefork(tsdn, &shard->pac.ecache_retained);
ecache_prefork(tsdn, &shard->pac.ecache_pinned);
if (shard->ever_used_hpa) {
hpa_shard_prefork4(tsdn, &shard->hpa_shard);
}
@ -50,6 +51,7 @@ pa_shard_postfork_parent(tsdn_t *tsdn, pa_shard_t *shard) {
ecache_postfork_parent(tsdn, &shard->pac.ecache_dirty);
ecache_postfork_parent(tsdn, &shard->pac.ecache_muzzy);
ecache_postfork_parent(tsdn, &shard->pac.ecache_retained);
ecache_postfork_parent(tsdn, &shard->pac.ecache_pinned);
malloc_mutex_postfork_parent(tsdn, &shard->pac.grow_mtx);
malloc_mutex_postfork_parent(tsdn, &shard->pac.decay_dirty.mtx);
malloc_mutex_postfork_parent(tsdn, &shard->pac.decay_muzzy.mtx);
@ -64,6 +66,7 @@ pa_shard_postfork_child(tsdn_t *tsdn, pa_shard_t *shard) {
ecache_postfork_child(tsdn, &shard->pac.ecache_dirty);
ecache_postfork_child(tsdn, &shard->pac.ecache_muzzy);
ecache_postfork_child(tsdn, &shard->pac.ecache_retained);
ecache_postfork_child(tsdn, &shard->pac.ecache_pinned);
malloc_mutex_postfork_child(tsdn, &shard->pac.grow_mtx);
malloc_mutex_postfork_child(tsdn, &shard->pac.decay_dirty.mtx);
malloc_mutex_postfork_child(tsdn, &shard->pac.decay_muzzy.mtx);
@ -107,12 +110,15 @@ pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard,
pa_shard_stats_out->pac_stats.retained +=
ecache_npages_get(&shard->pac.ecache_retained) << LG_PAGE;
pa_shard_stats_out->pac_stats.pinned +=
ecache_npages_get(&shard->pac.ecache_pinned) << LG_PAGE;
pa_shard_stats_out->edata_avail += atomic_load_zu(
&shard->edata_cache.count, ATOMIC_RELAXED);
size_t resident_pgs = 0;
resident_pgs += pa_shard_nactive(shard);
resident_pgs += pa_shard_ndirty(shard);
resident_pgs += ecache_npages_get(&shard->pac.ecache_pinned);
*resident += (resident_pgs << LG_PAGE);
/* Dirty decay stats */
@ -147,22 +153,27 @@ pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard,
atomic_load_zu(&shard->pac.stats->abandoned_vm, ATOMIC_RELAXED));
for (pszind_t i = 0; i < SC_NPSIZES; i++) {
size_t dirty, muzzy, retained, dirty_bytes, muzzy_bytes,
retained_bytes;
size_t dirty, muzzy, retained, pinned, dirty_bytes,
muzzy_bytes, retained_bytes, pinned_bytes;
dirty = ecache_nextents_get(&shard->pac.ecache_dirty, i);
muzzy = ecache_nextents_get(&shard->pac.ecache_muzzy, i);
retained = ecache_nextents_get(&shard->pac.ecache_retained, i);
pinned = ecache_nextents_get(&shard->pac.ecache_pinned, i);
dirty_bytes = ecache_nbytes_get(&shard->pac.ecache_dirty, i);
muzzy_bytes = ecache_nbytes_get(&shard->pac.ecache_muzzy, i);
retained_bytes = ecache_nbytes_get(
&shard->pac.ecache_retained, i);
pinned_bytes = ecache_nbytes_get(
&shard->pac.ecache_pinned, i);
estats_out[i].ndirty = dirty;
estats_out[i].nmuzzy = muzzy;
estats_out[i].nretained = retained;
estats_out[i].npinned = pinned;
estats_out[i].dirty_bytes = dirty_bytes;
estats_out[i].muzzy_bytes = muzzy_bytes;
estats_out[i].retained_bytes = retained_bytes;
estats_out[i].pinned_bytes = pinned_bytes;
}
if (shard->ever_used_hpa) {
@ -189,6 +200,8 @@ pa_shard_mtx_stats_read(tsdn_t *tsdn, pa_shard_t *shard,
&shard->pac.ecache_muzzy.mtx, arena_prof_mutex_extents_muzzy);
pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data,
&shard->pac.ecache_retained.mtx, arena_prof_mutex_extents_retained);
pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data,
&shard->pac.ecache_pinned.mtx, arena_prof_mutex_extents_pinned);
pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data,
&shard->pac.decay_dirty.mtx, arena_prof_mutex_decay_dirty);
pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data,

src/pac.c
View file

@ -31,6 +31,7 @@ pac_decay_data_get(pac_t *pac, extent_state_t state, decay_t **r_decay,
return;
case extent_state_active:
case extent_state_retained:
case extent_state_pinned:
case extent_state_transition:
case extent_state_merging:
default:
@ -72,6 +73,12 @@ pac_init(tsdn_t *tsdn, pac_t *pac, base_t *base, emap_t *emap,
/* delay_coalesce */ false)) {
return true;
}
/* Pinned extents: no decay, delayed coalesce. */
if (ecache_init(tsdn, &pac->ecache_pinned, extent_state_pinned, ind,
/* delay_coalesce */ true)) {
return true;
}
atomic_store_b(&pac->has_pinned, false, ATOMIC_RELAXED);
exp_grow_init(&pac->exp_grow);
if (malloc_mutex_init(&pac->grow_mtx, "extent_grow",
WITNESS_RANK_EXTENT_GROW, malloc_mutex_rank_exclusive)) {
@ -110,6 +117,14 @@ pac_may_have_muzzy(pac_t *pac) {
return pac_decay_ms_get(pac, extent_state_muzzy) != 0;
}
static inline void
pac_ecache_dalloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
edata_t *edata) {
ecache_dalloc(tsdn, pac, ehooks,
edata_pinned_get(edata) ? &pac->ecache_pinned : &pac->ecache_dirty,
edata);
}
static size_t
pac_alloc_retained_batched_size(size_t size) {
if (size > SC_LARGE_MAXCLASS) {
@ -133,8 +148,22 @@ pac_alloc_real(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size,
assert(!guarded || alignment <= PAGE);
size_t newly_mapped_size = 0;
edata_t *edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_dirty,
NULL, size, alignment, zero, guarded);
edata_t *edata = NULL;
/*
* Guarded allocations need surrounding guard pages, which the pinned
* pool does not maintain; skip ecache_pinned in that case.
*/
if (!guarded && atomic_load_b(&pac->has_pinned, ATOMIC_RELAXED)
&& ecache_npages_get(&pac->ecache_pinned) > 0) {
edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_pinned,
NULL, size, alignment, zero, guarded);
}
if (edata == NULL) {
edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_dirty,
NULL, size, alignment, zero, guarded);
}
if (edata == NULL && pac_may_have_muzzy(pac)) {
edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_muzzy,
@ -180,12 +209,10 @@ pac_alloc_real(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size,
edata, size, batched_size - size,
/* holding_core_locks */ false);
if (trail == NULL) {
ecache_dalloc(tsdn, pac, ehooks,
&pac->ecache_retained, edata);
pac_record_grown(tsdn, pac, ehooks, edata);
edata = NULL;
} else {
ecache_dalloc(tsdn, pac, ehooks,
&pac->ecache_dirty, trail);
pac_ecache_dalloc(tsdn, pac, ehooks, trail);
}
}
@ -277,23 +304,53 @@ pac_expand_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
if (ehooks_merge_will_fail(ehooks)) {
return true;
}
edata_t *trail = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_dirty,
edata, expand_amount, PAGE, zero, /* guarded*/ false);
if (trail == NULL) {
trail = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_muzzy,
edata, expand_amount, PAGE, zero, /* guarded*/ false);
edata_t *trail = NULL;
if (edata_pinned_get(edata)) {
trail = ecache_alloc(tsdn, pac, ehooks,
&pac->ecache_pinned, edata, expand_amount,
PAGE, zero, /* guarded */ false);
if (trail == NULL) {
/*
* Only ecache_pinned can hold a mergeable neighbor;
* dirty, muzzy, and retained extents are non-pinned.
* Pinned memory is already committed, and hooks are
* unlikely to reserve adjacent pinned space for growth,
* so don't consult the hook to grow in place.
*/
return true;
}
assert(edata_pinned_get(trail));
} else {
trail = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_dirty,
edata, expand_amount, PAGE, zero, /* guarded */ false);
if (trail == NULL) {
trail = ecache_alloc(tsdn, pac, ehooks,
&pac->ecache_muzzy, edata, expand_amount,
PAGE, zero, /* guarded */ false);
}
if (trail == NULL) {
trail = ecache_alloc_grow(tsdn, pac, ehooks,
&pac->ecache_retained, edata, expand_amount,
PAGE, zero, /* guarded */ false);
mapped_add = expand_amount;
}
if (trail == NULL) {
return true;
}
}
if (trail == NULL) {
trail = ecache_alloc_grow(tsdn, pac, ehooks,
&pac->ecache_retained, edata, expand_amount, PAGE, zero,
/* guarded */ false);
mapped_add = expand_amount;
}
if (trail == NULL) {
return true;
}
if (extent_merge_wrapper(tsdn, pac, ehooks, edata, trail)) {
extent_dalloc_wrapper(tsdn, pac, ehooks, trail);
/* extent_merge_wrapper requires matching pinnedness. */
if ((edata_pinned_get(edata) != edata_pinned_get(trail))
|| extent_merge_wrapper(tsdn, pac, ehooks, edata, trail)) {
if (edata_pinned_get(trail)) {
if (config_stats) {
atomic_fetch_add_zu(&pac->stats->pac_mapped,
mapped_add, ATOMIC_RELAXED);
}
ecache_dalloc(tsdn, pac, ehooks,
&pac->ecache_pinned, trail);
} else {
extent_dalloc_wrapper(tsdn, pac, ehooks, trail);
}
return true;
}
if (config_stats && mapped_add > 0) {
@ -320,8 +377,11 @@ pac_shrink_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size,
if (trail == NULL) {
return true;
}
ecache_dalloc(tsdn, pac, ehooks, &pac->ecache_dirty, trail);
*deferred_work_generated = true;
bool pinned = edata_pinned_get(trail);
pac_ecache_dalloc(tsdn, pac, ehooks, trail);
if (!pinned) {
*deferred_work_generated = true;
}
return false;
}
@ -352,9 +412,11 @@ pac_dalloc_impl(
}
}
ecache_dalloc(tsdn, pac, ehooks, &pac->ecache_dirty, edata);
/* Purging of deallocated pages is deferred */
*deferred_work_generated = true;
bool pinned = edata_pinned_get(edata);
pac_ecache_dalloc(tsdn, pac, ehooks, edata);
if (!pinned) {
*deferred_work_generated = true;
}
}
static inline uint64_t
@ -543,6 +605,7 @@ pac_decay_stashed(tsdn_t *tsdn, pac_t *pac, decay_t *decay,
break;
case extent_state_active:
case extent_state_retained:
case extent_state_pinned:
case extent_state_transition:
case extent_state_merging:
default:
@ -721,7 +784,48 @@ pac_destroy(tsdn_t *tsdn, pac_t *pac) {
* dss-based extents for later reuse.
*/
ehooks_t *ehooks = pac_ehooks_get(pac);
edata_t *edata;
if (atomic_load_b(&pac->has_pinned, ATOMIC_RELAXED)) {
/*
* Reroute pinned extents through ecache_retained: clearing the
* pinned bit lets retained's eager coalesce merge fragments
* back to their original OS-allocation bases, so the destroy
* hook can release whole reservations (required on platforms
* like Windows where VirtualFree only accepts the original
* VirtualAlloc base). Subtract from pac_mapped along the way
* because retained is excluded from stats.mapped.
*/
edata_list_inactive_t pinned_list;
edata_list_inactive_init(&pinned_list);
malloc_mutex_lock(tsdn, &pac->ecache_pinned.mtx);
assert(eset_npages_get(&pac->ecache_pinned.guarded_eset) == 0);
size_t pinned_bytes =
eset_npages_get(&pac->ecache_pinned.eset) << LG_PAGE;
while (eset_npages_get(&pac->ecache_pinned.eset) > 0) {
edata = eset_fit(&pac->ecache_pinned.eset,
PAGE, PAGE, /* exact_only */ false, SC_PTR_BITS,
/* prefer_small */ false);
assert(edata != NULL);
assert(edata_pinned_get(edata));
eset_remove(&pac->ecache_pinned.eset, edata);
emap_update_edata_state(tsdn, pac->emap, edata,
extent_state_active);
edata_pinned_set(edata, false);
edata_list_inactive_append(&pinned_list, edata);
}
malloc_mutex_unlock(tsdn, &pac->ecache_pinned.mtx);
if (config_stats && pinned_bytes > 0) {
atomic_fetch_sub_zu(&pac->stats->pac_mapped,
pinned_bytes, ATOMIC_RELAXED);
}
while ((edata = edata_list_inactive_first(&pinned_list))
!= NULL) {
edata_list_inactive_remove(&pinned_list, edata);
extent_record(tsdn, pac, ehooks,
&pac->ecache_retained, edata);
}
}
assert(ecache_npages_get(&pac->ecache_pinned) == 0);
while (
(edata = ecache_evict(tsdn, pac, ehooks, &pac->ecache_retained, 0))
!= NULL) {

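For the ecache_pinned path in pac_alloc_real above to be reachable, an arena
must run hooks whose alloc reports EXTENT_ALLOC_FLAG_PINNED; has_pinned flips
to true the first time that happens.  A hedged sketch of wiring this up
through the existing arena.<i>.extent_hooks mallctl, reusing the hypothetical
pinned_extent_alloc from the sketch under the commit message:

#include <stdbool.h>
#include <stdio.h>
#include <jemalloc/jemalloc.h>

/* Hypothetical hook from the earlier sketch. */
extern void *pinned_extent_alloc(extent_hooks_t *, void *, size_t, size_t,
    bool *, bool *, unsigned);

static extent_hooks_t pinned_hooks;

static int
install_pinned_hooks(unsigned arena_ind) {
	char name[64];
	extent_hooks_t *old_hooks;
	extent_hooks_t *new_hooks = &pinned_hooks;
	size_t sz = sizeof(old_hooks);

	snprintf(name, sizeof(name), "arena.%u.extent_hooks", arena_ind);
	/* Start from the arena's current hooks and override only alloc. */
	if (mallctl(name, (void *)&old_hooks, &sz, NULL, 0) != 0) {
		return -1;
	}
	pinned_hooks = *old_hooks;
	pinned_hooks.alloc = pinned_extent_alloc;
	return mallctl(name, NULL, NULL, (void *)&new_hooks, sizeof(new_hooks));
}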
View file

@ -712,6 +712,8 @@ stats_arena_extents_print(emitter_t *emitter, unsigned i) {
COL_HDR(row, muzzy, NULL, right, 13, size)
COL_HDR(row, nretained, NULL, right, 13, size)
COL_HDR(row, retained, NULL, right, 13, size)
COL_HDR(row, npinned, NULL, right, 13, size)
COL_HDR(row, pinned, NULL, right, 13, size)
COL_HDR(row, ntotal, NULL, right, 13, size)
COL_HDR(row, total, NULL, right, 13, size)
@ -728,22 +730,27 @@ stats_arena_extents_print(emitter_t *emitter, unsigned i) {
in_gap = false;
for (j = 0; j < SC_NPSIZES; j++) {
size_t ndirty, nmuzzy, nretained, total, dirty_bytes,
muzzy_bytes, retained_bytes, total_bytes;
size_t ndirty, nmuzzy, nretained, npinned, total,
dirty_bytes, muzzy_bytes, retained_bytes, pinned_bytes,
total_bytes;
stats_arenas_mib[4] = j;
CTL_LEAF(stats_arenas_mib, 5, "ndirty", &ndirty, size_t);
CTL_LEAF(stats_arenas_mib, 5, "nmuzzy", &nmuzzy, size_t);
CTL_LEAF(stats_arenas_mib, 5, "nretained", &nretained, size_t);
CTL_LEAF(stats_arenas_mib, 5, "npinned", &npinned, size_t);
CTL_LEAF(
stats_arenas_mib, 5, "dirty_bytes", &dirty_bytes, size_t);
CTL_LEAF(
stats_arenas_mib, 5, "muzzy_bytes", &muzzy_bytes, size_t);
CTL_LEAF(stats_arenas_mib, 5, "retained_bytes", &retained_bytes,
size_t);
CTL_LEAF(stats_arenas_mib, 5, "pinned_bytes", &pinned_bytes,
size_t);
total = ndirty + nmuzzy + nretained;
total_bytes = dirty_bytes + muzzy_bytes + retained_bytes;
total = ndirty + nmuzzy + nretained + npinned;
total_bytes = dirty_bytes + muzzy_bytes + retained_bytes
+ pinned_bytes;
in_gap_prev = in_gap;
in_gap = (total == 0);
@ -758,6 +765,8 @@ stats_arena_extents_print(emitter_t *emitter, unsigned i) {
emitter_json_kv(emitter, "nmuzzy", emitter_type_size, &nmuzzy);
emitter_json_kv(
emitter, "nretained", emitter_type_size, &nretained);
emitter_json_kv(
emitter, "npinned", emitter_type_size, &npinned);
emitter_json_kv(
emitter, "dirty_bytes", emitter_type_size, &dirty_bytes);
@ -765,6 +774,8 @@ stats_arena_extents_print(emitter_t *emitter, unsigned i) {
emitter, "muzzy_bytes", emitter_type_size, &muzzy_bytes);
emitter_json_kv(emitter, "retained_bytes", emitter_type_size,
&retained_bytes);
emitter_json_kv(emitter, "pinned_bytes", emitter_type_size,
&pinned_bytes);
emitter_json_object_end(emitter);
col_size.size_val = sz_pind2sz(j);
@ -775,6 +786,8 @@ stats_arena_extents_print(emitter_t *emitter, unsigned i) {
col_muzzy.size_val = muzzy_bytes;
col_nretained.size_val = nretained;
col_retained.size_val = retained_bytes;
col_npinned.size_val = npinned;
col_pinned.size_val = pinned_bytes;
col_ntotal.size_val = total;
col_total.size_val = total_bytes;
@ -1166,7 +1179,7 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large,
unsigned nthreads;
const char *dss;
ssize_t dirty_decay_ms, muzzy_decay_ms;
size_t page, pactive, pdirty, pmuzzy, mapped, retained;
size_t page, pactive, pdirty, pmuzzy, mapped, retained, pinned;
size_t base, internal, resident, metadata_edata, metadata_rtree,
metadata_thp, extent_avail;
uint64_t dirty_npurge, dirty_nmadvise, dirty_purged;
@ -1467,6 +1480,7 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large,
GET_AND_EMIT_MEM_STAT(mapped)
GET_AND_EMIT_MEM_STAT(retained)
GET_AND_EMIT_MEM_STAT(pinned)
GET_AND_EMIT_MEM_STAT(base)
GET_AND_EMIT_MEM_STAT(internal)
GET_AND_EMIT_MEM_STAT(metadata_edata)
@ -1872,7 +1886,7 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed,
* the transition to the emitter code.
*/
size_t allocated, active, metadata, metadata_edata, metadata_rtree,
metadata_thp, resident, mapped, retained;
metadata_thp, resident, mapped, retained, pinned;
size_t num_background_threads;
size_t zero_reallocs;
uint64_t background_thread_num_runs, background_thread_run_interval;
@ -1886,6 +1900,7 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed,
CTL_GET("stats.resident", &resident, size_t);
CTL_GET("stats.mapped", &mapped, size_t);
CTL_GET("stats.retained", &retained, size_t);
CTL_GET("stats.pinned", &pinned, size_t);
CTL_GET("stats.zero_reallocs", &zero_reallocs, size_t);
@ -1916,15 +1931,16 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed,
emitter_json_kv(emitter, "resident", emitter_type_size, &resident);
emitter_json_kv(emitter, "mapped", emitter_type_size, &mapped);
emitter_json_kv(emitter, "retained", emitter_type_size, &retained);
emitter_json_kv(emitter, "pinned", emitter_type_size, &pinned);
emitter_json_kv(
emitter, "zero_reallocs", emitter_type_size, &zero_reallocs);
emitter_table_printf(emitter,
"Allocated: %zu, active: %zu, "
"metadata: %zu (n_thp %zu, edata %zu, rtree %zu), resident: %zu, "
"mapped: %zu, retained: %zu\n",
"mapped: %zu, retained: %zu, pinned: %zu\n",
allocated, active, metadata, metadata_thp, metadata_edata,
metadata_rtree, resident, mapped, retained);
metadata_rtree, resident, mapped, retained, pinned);
/* Strange behaviors */
emitter_table_printf(emitter,