From be2de8ccd8ba38d687647ed43d75dc0ee5846df7 Mon Sep 17 00:00:00 2001 From: Bin Liu Date: Sun, 19 Apr 2026 22:56:22 -0700 Subject: [PATCH] Introduce pinned extents to contain unpurgeable pages Some pages (e.g., hugetlb pages) cannot be purged, and should be prioritized for reuse. A custom extent_alloc hook signals this by OR'ing EXTENT_ALLOC_FLAG_PINNED into the low bits of the returned pointer; jemalloc strips the flag bits and caches pinned extents in a dedicated ecache_pinned, separate from the dirty/muzzy decay pipeline. Pinned extents do not coalesce eagerly, except for ones larger than SC_LARGE_MINCLASS. A prefer-small policy reuses the smallest fitting pinned extent, to avoid unnecessary split/fragmentation. --- Makefile.in | 1 + configure.ac | 3 + doc/jemalloc.xml.in | 86 +++- include/jemalloc/internal/ctl.h | 1 + include/jemalloc/internal/edata.h | 50 ++- include/jemalloc/internal/ehooks.h | 14 +- include/jemalloc/internal/emap.h | 1 + include/jemalloc/internal/eset.h | 2 +- include/jemalloc/internal/extent.h | 4 + include/jemalloc/internal/mutex_prof.h | 1 + include/jemalloc/internal/pac.h | 27 +- include/jemalloc/jemalloc_typedefs.h.in | 30 ++ src/arena.c | 2 + src/base.c | 5 +- src/ctl.c | 26 +- src/emap.c | 3 +- src/eset.c | 27 +- src/extent.c | 73 +++- src/pa_extra.c | 17 +- src/pac.c | 160 ++++++-- src/stats.c | 32 +- test/unit/extent_alloc_flags.c | 498 ++++++++++++++++++++++++ 22 files changed, 977 insertions(+), 86 deletions(-) create mode 100644 test/unit/extent_alloc_flags.c diff --git a/Makefile.in b/Makefile.in index 2a8c5b49..38320810 100644 --- a/Makefile.in +++ b/Makefile.in @@ -246,6 +246,7 @@ TESTS_UNIT := \ $(srcroot)test/unit/hpa_vectorized_madvise_large_batch.c \ $(srcroot)test/unit/hpa_background_thread.c \ $(srcroot)test/unit/hpdata.c \ + $(srcroot)test/unit/extent_alloc_flags.c \ $(srcroot)test/unit/huge.c \ $(srcroot)test/unit/inspect.c \ $(srcroot)test/unit/jemalloc_init.c \ diff --git a/configure.ac b/configure.ac index d1518298..75eea008 100644 --- a/configure.ac +++ b/configure.ac @@ -2062,6 +2062,9 @@ if test "x${LG_PAGE}" != "xundefined" ; then else AC_MSG_ERROR([cannot determine value for LG_PAGE]) fi +if test "${LG_PAGE}" -lt 8 ; then + AC_MSG_ERROR([LG_PAGE must be at least 8 (PAGE >= 256)]) +fi AC_ARG_WITH([lg_hugepage], [AS_HELP_STRING([--with-lg-hugepage=], diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index 53f116af..d3e475b8 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -2172,6 +2172,15 @@ malloc_conf = "xmalloc:true";]]> addition there may be extents created prior to the application having an opportunity to take over extent allocation. + An extent must be operated on (dalloc, destroy, commit, decommit, + purge, split, merge) by a hook capable of handling it, normally the hook + that allocated it. Replacing hooks on a live arena is tricky and thus + discouraged. If the hook is replaced anyway, the new hook should forward + operations on extents it did not allocate to the previous hook (e.g., + the new dalloc dispatches to the previous dalloc for an + old-hook-allocated extent). The new hook should also avoid merging + extents allocated by different hooks. + arena.<i>.dss setting irrelevant. + The alloc hook may bitwise-OR + EXTENT_ALLOC_FLAG_PINNED into the low bits of + the returned pointer to indicate that the backing memory is + non-reclaimable (e.g. HugeTLB pages) and should be reused + preferentially; in that case *commit must also + be set to true. jemalloc strips the low byte before use. 
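+          For illustration, a hook may return pinned memory
+          opportunistically and fall back to an ordinary mapping when
+          the pinned pool is exhausted (a sketch only:
+          my_hugetlb_alloc() is a hypothetical application
+          helper, and alignment handling is omitted for brevity):
+            void *
+            my_alloc(extent_hooks_t *h, void *new_addr, size_t size,
+                size_t alignment, bool *zero, bool *commit,
+                unsigned arena_ind) {
+              void *addr = my_hugetlb_alloc(size, alignment);
+              if (addr != NULL) {
+                *zero = true;
+                *commit = true; /* mandatory for pinned extents */
+                return (void *)((uintptr_t)addr
+                    | EXTENT_ALLOC_FLAG_PINNED);
+              }
+              addr = mmap(NULL, size, PROT_READ|PROT_WRITE,
+                  MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+              if (addr == MAP_FAILED) {
+                return NULL;
+              }
+              *zero = true;
+              *commit = true;
+              return addr; /* unflagged: ordinary, reclaimable */
+            }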
The + pinned attribute is per-extent rather than per-hook: a single alloc + hook may return pinned and non-pinned extents in different calls. + Pinned-ness is set at allocation, inherited through splits, and + never changes after that. Pinned and non-pinned extents are never + merged together. + typedef bool (extent_dalloc_t) extent_hooks_t *extent_hooks @@ -2768,11 +2789,11 @@ struct extent_hooks_s { Maximum number of bytes in physically resident data pages mapped by the allocator, comprising all pages dedicated to - allocator metadata, pages backing active allocations, and unused dirty - pages. This is a maximum rather than precise because pages may not - actually be physically resident if they correspond to demand-zeroed - virtual memory that has not yet been touched. This is a multiple of the - page size, and is larger than stats.active. @@ -2811,6 +2832,22 @@ struct extent_hooks_s { + + + stats.pinned + (size_t) + r- + [] + + Total number of bytes in unused extents backed by + non-reclaimable memory. Pinned extents are tracked separately from + dirty, muzzy, and retained extents because they are excluded from + decay and purging; unlike stats.retained, + pinned bytes are included in stats.mapped. + + stats.zero_reallocs @@ -3089,6 +3126,18 @@ struct extent_hooks_s { details. + + + stats.arenas.<i>.pinned + (size_t) + r- + [] + + Number of pinned bytes. See stats.pinned for + details. + + stats.arenas.<i>.extent_avail @@ -3146,11 +3195,11 @@ struct extent_hooks_s { Maximum number of bytes in physically resident data pages mapped by the arena, comprising all pages dedicated to allocator - metadata, pages backing active allocations, and unused dirty pages. - This is a maximum rather than precise because pages may not actually be - physically resident if they correspond to demand-zeroed virtual memory - that has not yet been touched. This is a multiple of the page - size. + metadata, pages backing active allocations, unused dirty pages, and + pinned pages. This is a maximum rather than precise because pages + may not actually be physically resident if they correspond to + demand-zeroed virtual memory that has not yet been touched. This is + a multiple of the page size. @@ -3493,7 +3542,7 @@ struct extent_hooks_s { Number of extents of the given type in this arena in the bucket corresponding to page size index <j>. The extent type - is one of dirty, muzzy, or retained. + is one of dirty, muzzy, retained, or pinned. @@ -3505,7 +3554,7 @@ struct extent_hooks_s { Sum of the bytes managed by extents of the given type in this arena in the bucket corresponding to page size index <j>. - The extent type is one of dirty, muzzy, or retained. + The extent type is one of dirty, muzzy, retained, or pinned. @@ -3625,6 +3674,19 @@ struct extent_hooks_s { counters. + + + stats.arenas.<i>.mutexes.extents_pinned.{counter} + (counter specific type) r- + [] + + Statistics on arena.<i>.extents_pinned + mutex (arena scope; pinned extents related). + {counter} is one of the counters in mutex profiling + counters. 
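+          For reference, the pinned counters documented above read
+          like any other stats mallctl (illustrative snippet; arena
+          index 0 and no error handling assumed):
+            uint64_t epoch = 1;
+            mallctl("epoch", NULL, NULL, &epoch, sizeof(epoch));
+            size_t pinned, sz = sizeof(pinned);
+            mallctl("stats.arenas.0.pinned", &pinned, &sz, NULL, 0);
+            mallctl("stats.pinned", &pinned, &sz, NULL, 0);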
+ + stats.arenas.<i>.mutexes.decay_dirty.{counter} diff --git a/include/jemalloc/internal/ctl.h b/include/jemalloc/internal/ctl.h index 82035fe3..e7a8221c 100644 --- a/include/jemalloc/internal/ctl.h +++ b/include/jemalloc/internal/ctl.h @@ -63,6 +63,7 @@ typedef struct ctl_stats_s { size_t resident; size_t mapped; size_t retained; + size_t pinned; background_thread_stats_t background_thread; mutex_prof_data_t mutex_prof_data[mutex_prof_num_global_mutexes]; diff --git a/include/jemalloc/internal/edata.h b/include/jemalloc/internal/edata.h index af3676ff..1da4cdda 100644 --- a/include/jemalloc/internal/edata.h +++ b/include/jemalloc/internal/edata.h @@ -34,9 +34,10 @@ enum extent_state_e { extent_state_dirty = 1, extent_state_muzzy = 2, extent_state_retained = 3, - extent_state_transition = 4, /* States below are intermediate. */ - extent_state_merging = 5, - extent_state_max = 5 /* Sanity checking only. */ + extent_state_pinned = 4, + extent_state_transition = 5, /* States below are intermediate. */ + extent_state_merging = 6, + extent_state_max = 6 /* Sanity checking only. */ }; typedef enum extent_state_e extent_state_t; @@ -110,8 +111,10 @@ struct edata_s { * i: szind * f: nfree * s: bin_shard + * h: is_head + * n: pinned * - * 00000000 ... 0000ssss ssffffff ffffiiii iiiitttg zpcbaaaa aaaaaaaa + * 00000000 ... 0nhsssss ssffffff ffffiiii iiiitttg zpcbaaaa aaaaaaaa * * arena_ind: Arena from which this extent came, or all 1 bits if * unassociated. @@ -145,6 +148,10 @@ struct edata_s { * nfree: Number of free regions in slab. * * bin_shard: the shard of the bin from which this extent came. + * + * is_head: see comments in ehooks_default_merge_impl(). + * + * pinned: true if the alloc hook signaled non-reclaimable backing. */ uint64_t e_bits; #define MASK(CURRENT_FIELD_WIDTH, CURRENT_FIELD_SHIFT) \ @@ -210,6 +217,16 @@ struct edata_s { #define EDATA_BITS_IS_HEAD_MASK \ MASK(EDATA_BITS_IS_HEAD_WIDTH, EDATA_BITS_IS_HEAD_SHIFT) +#define EDATA_BITS_PINNED_WIDTH 1 +#define EDATA_BITS_PINNED_SHIFT \ + (EDATA_BITS_IS_HEAD_WIDTH + EDATA_BITS_IS_HEAD_SHIFT) +#define EDATA_BITS_PINNED_MASK \ + MASK(EDATA_BITS_PINNED_WIDTH, EDATA_BITS_PINNED_SHIFT) + +#if (EDATA_BITS_PINNED_SHIFT + EDATA_BITS_PINNED_WIDTH > 64) +#error "edata_t e_bits overflow" +#endif + /* Pointer to the extent that this structure is responsible for. */ void *e_addr; @@ -538,6 +555,29 @@ edata_ps_set(edata_t *edata, hpdata_t *ps) { edata->e_ps = ps; } +static inline bool +edata_pinned_get(const edata_t *edata) { + return (bool)((edata->e_bits & EDATA_BITS_PINNED_MASK) + >> EDATA_BITS_PINNED_SHIFT); +} + +static inline void +edata_pinned_set(edata_t *edata, bool pinned) { + edata->e_bits = (edata->e_bits & ~EDATA_BITS_PINNED_MASK) + | ((uint64_t)pinned << EDATA_BITS_PINNED_SHIFT); +} + +static inline void +edata_hook_flags_init(edata_t *edata, unsigned alloc_flags) { + edata_pinned_set(edata, + (alloc_flags & EXTENT_ALLOC_FLAG_PINNED) != 0); +} + +static inline unsigned +edata_alloc_flags_get(const edata_t *edata) { + return edata_pinned_get(edata) ? EXTENT_ALLOC_FLAG_PINNED : 0; +} + static inline void edata_szind_set(edata_t *edata, szind_t szind) { assert(szind <= SC_NSIZES); /* SC_NSIZES means "invalid". 
*/ @@ -686,6 +726,7 @@ edata_init(edata_t *edata, unsigned arena_ind, void *addr, size_t size, edata_committed_set(edata, committed); edata_pai_set(edata, pai); edata_is_head_set(edata, is_head == EXTENT_IS_HEAD); + edata_hook_flags_init(edata, 0); if (config_prof) { edata_prof_tctx_set(edata, NULL); } @@ -711,6 +752,7 @@ edata_binit( * wasting a state bit to encode this fact. */ edata_pai_set(edata, EXTENT_PAI_PAC); + edata_hook_flags_init(edata, 0); } static inline int diff --git a/include/jemalloc/internal/ehooks.h b/include/jemalloc/internal/ehooks.h index a5880a4d..128d2f2c 100644 --- a/include/jemalloc/internal/ehooks.h +++ b/include/jemalloc/internal/ehooks.h @@ -191,7 +191,7 @@ ehooks_debug_zero_check(void *addr, size_t size) { static inline void * ehooks_alloc(tsdn_t *tsdn, ehooks_t *ehooks, void *new_addr, size_t size, - size_t alignment, bool *zero, bool *commit) { + size_t alignment, bool *zero, bool *commit, unsigned *alloc_flags) { bool orig_zero = *zero; void *ret; extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks); @@ -204,6 +204,18 @@ ehooks_alloc(tsdn_t *tsdn, ehooks_t *ehooks, void *new_addr, size_t size, alignment, zero, commit, ehooks_ind_get(ehooks)); ehooks_post_reentrancy(tsdn); } +#if LG_PAGE < 8 +# error "Extent alloc flags require page size of at least 256" +#endif + if (ret != NULL) { + *alloc_flags = (unsigned)((uintptr_t)ret + & EXTENT_ALLOC_FLAG_MASK); + ret = (void *)((byte_t *)ret - *alloc_flags); + /* Pinned hooks must also set *commit; pinned bypasses commit/decommit. */ + assert(!(*alloc_flags & EXTENT_ALLOC_FLAG_PINNED) || *commit); + } else { + *alloc_flags = 0; + } assert(new_addr == NULL || ret == NULL || new_addr == ret); assert(!orig_zero || *zero); if (*zero && ret != NULL) { diff --git a/include/jemalloc/internal/emap.h b/include/jemalloc/internal/emap.h index 88692356..f123d1b9 100644 --- a/include/jemalloc/internal/emap.h +++ b/include/jemalloc/internal/emap.h @@ -211,6 +211,7 @@ extent_assert_can_coalesce(const edata_t *inner, const edata_t *outer) { assert(edata_state_get(inner) == extent_state_active); assert(edata_state_get(outer) == extent_state_merging); assert(!edata_guarded_get(inner) && !edata_guarded_get(outer)); + assert(edata_pinned_get(inner) == edata_pinned_get(outer)); assert(edata_base_get(inner) == edata_past_get(outer) || edata_base_get(outer) == edata_past_get(inner)); } diff --git a/include/jemalloc/internal/eset.h b/include/jemalloc/internal/eset.h index a4cae8c5..18f6eea3 100644 --- a/include/jemalloc/internal/eset.h +++ b/include/jemalloc/internal/eset.h @@ -73,6 +73,6 @@ void eset_remove(eset_t *eset, edata_t *edata); * null if no such item could be found. */ edata_t *eset_fit(eset_t *eset, size_t esize, size_t alignment, bool exact_only, - unsigned lg_max_fit); + unsigned lg_max_fit, bool prefer_small); #endif /* JEMALLOC_INTERNAL_ESET_H */ diff --git a/include/jemalloc/internal/extent.h b/include/jemalloc/internal/extent.h index 28b8e2d4..91aafb3a 100644 --- a/include/jemalloc/internal/extent.h +++ b/include/jemalloc/internal/extent.h @@ -120,6 +120,10 @@ extent_can_acquire_neighbor(const edata_t *edata, rtree_contents_t contents, */ return false; } + /* Do not merge pinned and non-pinned extents. 
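+	 * Merging would silently change the reclamation policy of
+	 * one half of the merged extent.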
*/ + if (edata_pinned_get(edata) != edata_pinned_get(neighbor)) { + return false; + } } else { if (neighbor_state == extent_state_active) { return false; diff --git a/include/jemalloc/internal/mutex_prof.h b/include/jemalloc/internal/mutex_prof.h index 572200f3..37f6a377 100644 --- a/include/jemalloc/internal/mutex_prof.h +++ b/include/jemalloc/internal/mutex_prof.h @@ -30,6 +30,7 @@ typedef enum { OP(extents_dirty) \ OP(extents_muzzy) \ OP(extents_retained) \ + OP(extents_pinned) \ OP(decay_dirty) \ OP(decay_muzzy) \ OP(base) \ diff --git a/include/jemalloc/internal/pac.h b/include/jemalloc/internal/pac.h index 286be2b9..069ab107 100644 --- a/include/jemalloc/internal/pac.h +++ b/include/jemalloc/internal/pac.h @@ -51,6 +51,8 @@ struct pac_estats_s { size_t muzzy_bytes; size_t nretained; size_t retained_bytes; + size_t npinned; + size_t pinned_bytes; }; typedef struct pac_stats_s pac_stats_t; @@ -61,9 +63,14 @@ struct pac_stats_s { /* * Number of unused virtual memory bytes currently retained. Retained * bytes are technically mapped (though always decommitted or purged), - * but they are excluded from the mapped statistic (above). + * but they are excluded from pac_mapped. */ size_t retained; /* Derived. */ + /* + * Number of bytes in pinned (non-reclaimable) extents currently + * cached. Unlike retained, pinned bytes count toward pac_mapped. + */ + size_t pinned; /* Derived. */ /* * Number of bytes currently mapped, excluding retained memory (and any @@ -85,6 +92,8 @@ struct pac_s { * pointer). The handle to the allocation interface. */ pai_t pai; + /* True once pinned memory has been seen. */ + atomic_b_t has_pinned; /* * Collections of extents that were previously allocated. These are * used when allocating extents, in an attempt to re-use address space. @@ -94,6 +103,7 @@ struct pac_s { ecache_t ecache_dirty; ecache_t ecache_muzzy; ecache_t ecache_retained; + ecache_t ecache_pinned; base_t *base; emap_t *emap; @@ -160,6 +170,21 @@ pac_mapped(const pac_t *pac) { return atomic_load_zu(&pac->stats->pac_mapped, ATOMIC_RELAXED); } +void extent_record(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + ecache_t *ecache, edata_t *edata); + +static inline void +pac_record_grown(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata) { + bool pinned = edata_pinned_get(edata); + if (pinned && config_stats) { + atomic_fetch_add_zu(&pac->stats->pac_mapped, + edata_size_get(edata), ATOMIC_RELAXED); + } + extent_record(tsdn, pac, ehooks, + pinned ? &pac->ecache_pinned : &pac->ecache_retained, edata); +} + static inline ehooks_t * pac_ehooks_get(const pac_t *pac) { return base_ehooks_get(pac->base); diff --git a/include/jemalloc/jemalloc_typedefs.h.in b/include/jemalloc/jemalloc_typedefs.h.in index 793ee365..b07a09d8 100644 --- a/include/jemalloc/jemalloc_typedefs.h.in +++ b/include/jemalloc/jemalloc_typedefs.h.in @@ -4,6 +4,36 @@ extern "C" { typedef struct extent_hooks_s extent_hooks_t; +/* + * Extent alloc flags. A custom extent_alloc hook may OR these into the + * returned pointer; jemalloc strips the low bits before use. Safe because + * returned addresses are at least page-aligned (PAGE >= 256). + * + * EXTENT_ALLOC_FLAG_PINNED: backing memory is non-reclaimable. + * Pinned extents are excluded from decay/purging and cached separately for + * preferential reuse. A hook returning this flag must also set *commit to + * true: pinned memory bypasses jemalloc's commit/decommit machinery. 
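+ *
+ * Internally jemalloc recovers and strips the flags along the lines of
+ *   unsigned flags = (unsigned)((uintptr_t)ret & EXTENT_ALLOC_FLAG_MASK);
+ *   ret = (void *)((uintptr_t)ret - flags);
+ * which is why the whole low byte of the returned address must be free
+ * of address bits (guaranteed when PAGE >= 256).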
+ * + * The pinned attribute is per-extent: a single hook may return pinned and + * non-pinned extents in different calls. Pinned and non-pinned extents are + * never merged together (the merge would change the reclamation policy of + * one half), so pinned-ness is set at allocation and inherited through + * splits, but never changes after that. + * + * Example (HugeTLB alloc hook): + * void *my_alloc(extent_hooks_t *h, void *new_addr, size_t size, + * size_t alignment, bool *zero, bool *commit, unsigned arena_ind) { + * void *addr = mmap(NULL, size, PROT_READ|PROT_WRITE, + * MAP_PRIVATE|MAP_ANONYMOUS|MAP_HUGETLB, -1, 0); + * if (addr == MAP_FAILED) return NULL; + * *zero = true; + * *commit = true; + * return (void *)((uintptr_t)addr | EXTENT_ALLOC_FLAG_PINNED); + * } + */ +#define EXTENT_ALLOC_FLAG_PINNED 0x1U +#define EXTENT_ALLOC_FLAG_MASK 0xFFU + /* * void * * extent_alloc(extent_hooks_t *extent_hooks, void *new_addr, size_t size, diff --git a/src/arena.c b/src/arena.c index 7d9bf1df..c50fd468 100644 --- a/src/arena.c +++ b/src/arena.c @@ -797,6 +797,8 @@ arena_prepare_base_deletion(tsd_t *tsd, base_t *base_to_destroy) { tsd, &pac->ecache_muzzy.mtx, delayed_mtx, &n_delayed); arena_prepare_base_deletion_sync( tsd, &pac->ecache_retained.mtx, delayed_mtx, &n_delayed); + arena_prepare_base_deletion_sync( + tsd, &pac->ecache_pinned.mtx, delayed_mtx, &n_delayed); } arena_prepare_base_deletion_sync_finish(tsd, delayed_mtx, n_delayed); } diff --git a/src/base.c b/src/base.c index a47b6a37..76227a5e 100644 --- a/src/base.c +++ b/src/base.c @@ -52,8 +52,9 @@ base_map(tsdn_t *tsdn, ehooks_t *ehooks, unsigned ind, size_t size) { if (ehooks_are_default(ehooks)) { addr = extent_alloc_mmap(NULL, size, alignment, &zero, &commit); } else { - addr = ehooks_alloc( - tsdn, ehooks, NULL, size, alignment, &zero, &commit); + UNUSED unsigned flags; + addr = ehooks_alloc(tsdn, ehooks, NULL, size, alignment, &zero, + &commit, &flags); } return addr; diff --git a/src/ctl.c b/src/ctl.c index 0b72086c..e77e48e2 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -257,9 +257,11 @@ INDEX_PROTO(stats_arenas_i_lextents_j) CTL_PROTO(stats_arenas_i_extents_j_ndirty) CTL_PROTO(stats_arenas_i_extents_j_nmuzzy) CTL_PROTO(stats_arenas_i_extents_j_nretained) +CTL_PROTO(stats_arenas_i_extents_j_npinned) CTL_PROTO(stats_arenas_i_extents_j_dirty_bytes) CTL_PROTO(stats_arenas_i_extents_j_muzzy_bytes) CTL_PROTO(stats_arenas_i_extents_j_retained_bytes) +CTL_PROTO(stats_arenas_i_extents_j_pinned_bytes) INDEX_PROTO(stats_arenas_i_extents_j) /* Merged set of stats for HPA shard. 
*/ @@ -320,6 +322,7 @@ CTL_PROTO(stats_arenas_i_pdirty) CTL_PROTO(stats_arenas_i_pmuzzy) CTL_PROTO(stats_arenas_i_mapped) CTL_PROTO(stats_arenas_i_retained) +CTL_PROTO(stats_arenas_i_pinned) CTL_PROTO(stats_arenas_i_extent_avail) CTL_PROTO(stats_arenas_i_dirty_npurge) CTL_PROTO(stats_arenas_i_dirty_nmadvise) @@ -355,6 +358,7 @@ CTL_PROTO(stats_metadata_thp) CTL_PROTO(stats_resident) CTL_PROTO(stats_mapped) CTL_PROTO(stats_retained) +CTL_PROTO(stats_pinned) CTL_PROTO(stats_zero_reallocs) CTL_PROTO(approximate_stats_active) CTL_PROTO(experimental_hooks_install) @@ -697,9 +701,11 @@ static const ctl_named_node_t stats_arenas_i_extents_j_node[] = { {NAME("ndirty"), CTL(stats_arenas_i_extents_j_ndirty)}, {NAME("nmuzzy"), CTL(stats_arenas_i_extents_j_nmuzzy)}, {NAME("nretained"), CTL(stats_arenas_i_extents_j_nretained)}, + {NAME("npinned"), CTL(stats_arenas_i_extents_j_npinned)}, {NAME("dirty_bytes"), CTL(stats_arenas_i_extents_j_dirty_bytes)}, {NAME("muzzy_bytes"), CTL(stats_arenas_i_extents_j_muzzy_bytes)}, - {NAME("retained_bytes"), CTL(stats_arenas_i_extents_j_retained_bytes)}}; + {NAME("retained_bytes"), CTL(stats_arenas_i_extents_j_retained_bytes)}, + {NAME("pinned_bytes"), CTL(stats_arenas_i_extents_j_pinned_bytes)}}; static const ctl_named_node_t super_stats_arenas_i_extents_j_node[] = { {NAME(""), CHILD(named, stats_arenas_i_extents_j)}}; @@ -807,6 +813,7 @@ static const ctl_named_node_t stats_arenas_i_node[] = { {NAME("pmuzzy"), CTL(stats_arenas_i_pmuzzy)}, {NAME("mapped"), CTL(stats_arenas_i_mapped)}, {NAME("retained"), CTL(stats_arenas_i_retained)}, + {NAME("pinned"), CTL(stats_arenas_i_pinned)}, {NAME("extent_avail"), CTL(stats_arenas_i_extent_avail)}, {NAME("dirty_npurge"), CTL(stats_arenas_i_dirty_npurge)}, {NAME("dirty_nmadvise"), CTL(stats_arenas_i_dirty_nmadvise)}, @@ -872,6 +879,7 @@ static const ctl_named_node_t stats_node[] = { {NAME("resident"), CTL(stats_resident)}, {NAME("mapped"), CTL(stats_mapped)}, {NAME("retained"), CTL(stats_retained)}, + {NAME("pinned"), CTL(stats_pinned)}, {NAME("background_thread"), CHILD(named, stats_background_thread)}, {NAME("mutexes"), CHILD(named, stats_mutexes)}, {NAME("arenas"), CHILD(indexed, stats_arenas)}, @@ -1111,6 +1119,8 @@ ctl_arena_stats_sdmerge( sdstats->astats.mapped += astats->astats.mapped; sdstats->astats.pa_shard_stats.pac_stats.retained += astats->astats.pa_shard_stats.pac_stats.retained; + sdstats->astats.pa_shard_stats.pac_stats.pinned += + astats->astats.pa_shard_stats.pac_stats.pinned; sdstats->astats.pa_shard_stats.edata_avail += astats->astats.pa_shard_stats.edata_avail; } @@ -1247,12 +1257,16 @@ ctl_arena_stats_sdmerge( sdstats->estats[i].nmuzzy += astats->estats[i].nmuzzy; sdstats->estats[i].nretained += astats->estats[i].nretained; + sdstats->estats[i].npinned += + astats->estats[i].npinned; sdstats->estats[i].dirty_bytes += astats->estats[i].dirty_bytes; sdstats->estats[i].muzzy_bytes += astats->estats[i].muzzy_bytes; sdstats->estats[i].retained_bytes += astats->estats[i].retained_bytes; + sdstats->estats[i].pinned_bytes += + astats->estats[i].pinned_bytes; } /* Merge HPA stats. 
*/ @@ -1367,6 +1381,8 @@ ctl_refresh(tsdn_t *tsdn) { ctl_stats->mapped = ctl_sarena->astats->astats.mapped; ctl_stats->retained = ctl_sarena->astats->astats.pa_shard_stats .pac_stats.retained; + ctl_stats->pinned = ctl_sarena->astats->astats.pa_shard_stats + .pac_stats.pinned; ctl_background_thread_stats_read(tsdn); @@ -3721,6 +3737,7 @@ CTL_RO_CGEN(config_stats, stats_metadata_thp, ctl_stats->metadata_thp, size_t) CTL_RO_CGEN(config_stats, stats_resident, ctl_stats->resident, size_t) CTL_RO_CGEN(config_stats, stats_mapped, ctl_stats->mapped, size_t) CTL_RO_CGEN(config_stats, stats_retained, ctl_stats->retained, size_t) +CTL_RO_CGEN(config_stats, stats_pinned, ctl_stats->pinned, size_t) CTL_RO_CGEN(config_stats, stats_background_thread_num_threads, ctl_stats->background_thread.num_threads, size_t) @@ -3786,6 +3803,8 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_mapped, arenas_i(mib[2])->astats->astats.mapped, size_t) CTL_RO_CGEN(config_stats, stats_arenas_i_retained, arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.retained, size_t) +CTL_RO_CGEN(config_stats, stats_arenas_i_pinned, + arenas_i(mib[2])->astats->astats.pa_shard_stats.pac_stats.pinned, size_t) CTL_RO_CGEN(config_stats, stats_arenas_i_extent_avail, arenas_i(mib[2])->astats->astats.pa_shard_stats.edata_avail, size_t) @@ -3958,6 +3977,7 @@ stats_mutexes_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, MUTEX_PROF_RESET(arena->pa_shard.pac.ecache_dirty.mtx); MUTEX_PROF_RESET(arena->pa_shard.pac.ecache_muzzy.mtx); MUTEX_PROF_RESET(arena->pa_shard.pac.ecache_retained.mtx); + MUTEX_PROF_RESET(arena->pa_shard.pac.ecache_pinned.mtx); MUTEX_PROF_RESET(arena->pa_shard.pac.decay_dirty.mtx); MUTEX_PROF_RESET(arena->pa_shard.pac.decay_muzzy.mtx); MUTEX_PROF_RESET(arena->tcache_ql_mtx); @@ -4034,12 +4054,16 @@ CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_nmuzzy, arenas_i(mib[2])->astats->estats[mib[4]].nmuzzy, size_t); CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_nretained, arenas_i(mib[2])->astats->estats[mib[4]].nretained, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_npinned, + arenas_i(mib[2])->astats->estats[mib[4]].npinned, size_t); CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_dirty_bytes, arenas_i(mib[2])->astats->estats[mib[4]].dirty_bytes, size_t); CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_muzzy_bytes, arenas_i(mib[2])->astats->estats[mib[4]].muzzy_bytes, size_t); CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_retained_bytes, arenas_i(mib[2])->astats->estats[mib[4]].retained_bytes, size_t); +CTL_RO_CGEN(config_stats, stats_arenas_i_extents_j_pinned_bytes, + arenas_i(mib[2])->astats->estats[mib[4]].pinned_bytes, size_t); static const ctl_named_node_t * stats_arenas_i_extents_j_index( diff --git a/src/emap.c b/src/emap.c index 54bfabab..c9a371d2 100644 --- a/src/emap.c +++ b/src/emap.c @@ -50,7 +50,8 @@ emap_try_acquire_edata_neighbor_impl(tsdn_t *tsdn, emap_t *emap, edata_t *edata, assert(!edata_state_in_transition(expected_state)); assert(expected_state == extent_state_dirty || expected_state == extent_state_muzzy - || expected_state == extent_state_retained); + || expected_state == extent_state_retained + || expected_state == extent_state_pinned); void *neighbor_addr = forward ? 
edata_past_get(edata) : edata_before_get(edata); diff --git a/src/eset.c b/src/eset.c index 7db57ee9..bdce1834 100644 --- a/src/eset.c +++ b/src/eset.c @@ -98,7 +98,10 @@ eset_insert(eset_t *eset, edata_t *edata) { eset_stats_add(eset, pind, size); } - edata_list_inactive_append(&eset->lru, edata); + /* Pinned extents skip LRU as they do not decay. */ + if (!edata_pinned_get(edata)) { + edata_list_inactive_append(&eset->lru, edata); + } size_t npages = size >> LG_PAGE; /* * All modifications to npages hold the mutex (as asserted above), so we @@ -143,7 +146,9 @@ eset_remove(eset_t *eset, edata_t *edata) { edata_heap_first(&eset->bins[pind].heap)); } } - edata_list_inactive_remove(&eset->lru, edata); + if (!edata_pinned_get(edata)) { + edata_list_inactive_remove(&eset->lru, edata); + } size_t npages = size >> LG_PAGE; /* * As in eset_insert, we hold eset->mtx and so don't need atomic @@ -279,10 +284,15 @@ eset_fit_alignment( * avoiding reusing and splitting large extents for smaller sizes. In practice, * it's set to opt_lg_extent_max_active_fit for the dirty eset and SC_PTR_BITS * for others. + * + * If prefer_small is true, return as soon as the smallest fitting bin yields a + * candidate, instead of scanning further bins for an older/lower extent. + * Useful for fragmentation control for the pinned pool. */ static edata_t * eset_first_fit( - eset_t *eset, size_t size, bool exact_only, unsigned lg_max_fit) { + eset_t *eset, size_t size, bool exact_only, unsigned lg_max_fit, + bool prefer_small) { edata_t *ret = NULL; edata_cmp_summary_t ret_summ JEMALLOC_CC_SILENCE_INIT({0}); @@ -327,6 +337,9 @@ eset_first_fit( if (sz_large_size_classes_disabled() && pind != pind_prev) { ret = eset_enumerate_search(eset, size, pind_prev, /* exact_only */ false, &ret_summ); + if (prefer_small && ret != NULL) { + return ret; + } } for (pszind_t i = @@ -363,6 +376,9 @@ eset_first_fit( edata_cmp_summary_get(edata)) == 0); ret = edata; + if (prefer_small) { + return ret; + } ret_summ = eset->bins[i].heap_min; } if (i == SC_NPSIZES) { @@ -376,14 +392,15 @@ eset_first_fit( edata_t * eset_fit(eset_t *eset, size_t esize, size_t alignment, bool exact_only, - unsigned lg_max_fit) { + unsigned lg_max_fit, bool prefer_small) { size_t max_size = esize + PAGE_CEILING(alignment) - PAGE; /* Beware size_t wrap-around. */ if (max_size < esize) { return NULL; } - edata_t *edata = eset_first_fit(eset, max_size, exact_only, lg_max_fit); + edata_t *edata = eset_first_fit(eset, max_size, exact_only, lg_max_fit, + prefer_small); if (alignment > PAGE && edata == NULL) { /* diff --git a/src/extent.c b/src/extent.c index 12050f04..af5ed47e 100644 --- a/src/extent.c +++ b/src/extent.c @@ -70,6 +70,7 @@ extent_may_force_decay(pac_t *pac) { static bool extent_try_delayed_coalesce(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, edata_t *edata) { + malloc_mutex_assert_owner(tsdn, &ecache->mtx); emap_update_edata_state(tsdn, pac->emap, edata, extent_state_active); bool coalesced; @@ -212,6 +213,7 @@ ecache_evict(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, switch (ecache->state) { case extent_state_dirty: case extent_state_muzzy: + case extent_state_pinned: emap_update_edata_state( tsdn, pac->emap, edata, extent_state_active); break; @@ -244,7 +246,11 @@ extents_abandon_vm(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, } /* * Leak extent after making sure its pages have already been purged, so - * that this is only a virtual memory leak. 
+ * that this is only a virtual memory leak, except when the extent is + * pinned/unpurgeable, for which a real memory leak happens. This is + * acceptable because reaching this path requires that an extent split + * fail, which is already an exceptional condition (typically an OOM + * on edata_t allocation). */ if (ecache->state == extent_state_dirty) { if (extent_purge_lazy_impl( @@ -434,8 +440,12 @@ extent_recycle_extract(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, * then no longer satify a request for its original size. To * limit this effect, when delayed coalescing is enabled, we * put a cap on how big an extent we can split for a request. + * + * Pinned extents are exempt: they are never purged, so the cap + * doesn't apply. */ - unsigned lg_max_fit = ecache->delay_coalesce + unsigned lg_max_fit = (ecache->delay_coalesce + && ecache != &pac->ecache_pinned) ? (unsigned)opt_lg_extent_max_active_fit : SC_PTR_BITS; @@ -448,7 +458,13 @@ extent_recycle_extract(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, * allocations. */ bool exact_only = (!maps_coalesce && !opt_retain) || guarded; - edata = eset_fit(eset, size, alignment, exact_only, lg_max_fit); + /* + * When selecting a pinned extent, avoid breaking larger extent + * if a smaller one works. + */ + bool prefer_small = (ecache == &pac->ecache_pinned); + edata = eset_fit(eset, size, alignment, exact_only, lg_max_fit, + prefer_small); } if (edata == NULL) { return NULL; @@ -733,10 +749,9 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, } bool zeroed = false; bool committed = false; - - void *ptr = ehooks_alloc( - tsdn, ehooks, NULL, alloc_size, PAGE, &zeroed, &committed); - + unsigned flags = 0; + void *ptr = ehooks_alloc(tsdn, ehooks, NULL, alloc_size, PAGE, &zeroed, + &committed, &flags); if (ptr == NULL) { edata_cache_put(tsdn, pac->edata_cache, edata); goto label_err; @@ -746,6 +761,10 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, edata_init(edata, ind, ptr, alloc_size, false, SC_NSIZES, extent_sn_next(pac), extent_state_active, zeroed, committed, EXTENT_PAI_PAC, EXTENT_IS_HEAD); + edata_hook_flags_init(edata, flags); + if (flags & EXTENT_ALLOC_FLAG_PINNED) { + atomic_store_b(&pac->has_pinned, true, ATOMIC_RELAXED); + } if (extent_register_no_gdump_add(tsdn, pac, edata)) { edata_cache_put(tsdn, pac->edata_cache, edata); @@ -767,12 +786,10 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, if (result == extent_split_interior_ok) { if (lead != NULL) { - extent_record( - tsdn, pac, ehooks, &pac->ecache_retained, lead); + pac_record_grown(tsdn, pac, ehooks, lead); } if (trail != NULL) { - extent_record( - tsdn, pac, ehooks, &pac->ecache_retained, trail); + pac_record_grown(tsdn, pac, ehooks, trail); } } else { /* @@ -784,8 +801,7 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, if (config_prof) { extent_gdump_add(tsdn, to_salvage); } - extent_record(tsdn, pac, ehooks, &pac->ecache_retained, - to_salvage); + pac_record_grown(tsdn, pac, ehooks, to_salvage); } if (to_leak != NULL) { extent_deregister_no_gdump_sub(tsdn, pac, to_leak); @@ -796,6 +812,8 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, } if (*commit && !edata_committed_get(edata)) { + /* Pinned memory must be committed by the hook. 
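+		 * ehooks_alloc() already asserted *commit for pinned
+		 * allocations, so tripping here would indicate a hook
+		 * bug.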
*/ + assert(!edata_pinned_get(edata)); if (extent_commit_impl( tsdn, ehooks, edata, 0, edata_size_get(edata), true)) { extent_record( @@ -815,10 +833,13 @@ extent_grow_retained(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, /* * Increment extent_grow_next if doing so wouldn't exceed the allowed - * range. + * range. Skip for pinned: pinned memory is a finite resource; + * oversized remnants waste it. */ /* All opportunities for failure are past. */ - exp_grow_size_commit(&pac->exp_grow, exp_grow_skip); + if (!(flags & EXTENT_ALLOC_FLAG_PINNED)) { + exp_grow_size_commit(&pac->exp_grow, exp_grow_skip); + } malloc_mutex_unlock(tsdn, &pac->grow_mtx); if (huge_arena_pac_thp.thp_madvise) { @@ -1020,11 +1041,13 @@ extent_record(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, edata = extent_try_coalesce( tsdn, pac, ehooks, ecache, edata, &coalesced_unused); } else if (edata_size_get(edata) >= SC_LARGE_MINCLASS) { - assert(ecache == &pac->ecache_dirty); + assert(edata_pinned_get(edata) + ? (ecache == &pac->ecache_pinned) + : (ecache == &pac->ecache_dirty)); /* Always coalesce large extents eagerly. */ /** * Maximum size limit (max_size) for large extents waiting to be coalesced - * in dirty ecache. + * in pinned/dirty ecache. * * When set to a non-zero value, this parameter restricts the maximum size * of large extents after coalescing. If the combined size of two extents @@ -1056,7 +1079,9 @@ extent_record(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache, edata = extent_try_coalesce_large(tsdn, pac, ehooks, ecache, edata, max_size, &coalesced); } while (coalesced); - if (edata_size_get(edata) >= atomic_load_zu( + /* Pinned extents cannot be purged; skip the oversize shortcut. */ + if (ecache == &pac->ecache_dirty + && edata_size_get(edata) >= atomic_load_zu( &pac->oversize_threshold, ATOMIC_RELAXED) && !background_thread_enabled() && extent_may_force_decay(pac)) { @@ -1119,8 +1144,9 @@ extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, void *new_addr, return NULL; } size_t palignment = ALIGNMENT_CEILING(alignment, PAGE); - void *addr = ehooks_alloc( - tsdn, ehooks, new_addr, size, palignment, &zero, commit); + unsigned flags = 0; + void *addr = ehooks_alloc(tsdn, ehooks, new_addr, size, palignment, + &zero, commit, &flags); if (addr == NULL) { edata_cache_put(tsdn, pac->edata_cache, edata); return NULL; @@ -1129,6 +1155,10 @@ extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, void *new_addr, /* slab */ false, SC_NSIZES, extent_sn_next(pac), extent_state_active, zero, *commit, EXTENT_PAI_PAC, opt_retain ? EXTENT_IS_HEAD : EXTENT_NOT_HEAD); + edata_hook_flags_init(edata, flags); + if (flags & EXTENT_ALLOC_FLAG_PINNED) { + atomic_store_b(&pac->has_pinned, true, ATOMIC_RELAXED); + } /* * Retained memory is not counted towards gdump. Only if an extent is * allocated as a separate mapping, i.e. 
growing_retained is false, then @@ -1328,6 +1358,7 @@ extent_split_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata, /* slab */ false, SC_NSIZES, edata_sn_get(edata), edata_state_get(edata), edata_zeroed_get(edata), edata_committed_get(edata), EXTENT_PAI_PAC, EXTENT_NOT_HEAD); + edata_hook_flags_init(trail, edata_alloc_flags_get(edata)); emap_prepare_t prepare; bool err = emap_split_prepare( tsdn, pac->emap, &prepare, edata, size_a, trail, size_b); @@ -1412,6 +1443,8 @@ extent_merge_impl(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *a, : edata_sn_get(b)); edata_zeroed_set(a, edata_zeroed_get(a) && edata_zeroed_get(b)); + assert(edata_pinned_get(a) == edata_pinned_get(b)); + emap_merge_commit(tsdn, pac->emap, &prepare, a, b); edata_cache_put(tsdn, pac->edata_cache, b); diff --git a/src/pa_extra.c b/src/pa_extra.c index 8d4c3562..43dc14bf 100644 --- a/src/pa_extra.c +++ b/src/pa_extra.c @@ -34,6 +34,7 @@ pa_shard_prefork4(tsdn_t *tsdn, pa_shard_t *shard) { ecache_prefork(tsdn, &shard->pac.ecache_dirty); ecache_prefork(tsdn, &shard->pac.ecache_muzzy); ecache_prefork(tsdn, &shard->pac.ecache_retained); + ecache_prefork(tsdn, &shard->pac.ecache_pinned); if (shard->ever_used_hpa) { hpa_shard_prefork4(tsdn, &shard->hpa_shard); } @@ -50,6 +51,7 @@ pa_shard_postfork_parent(tsdn_t *tsdn, pa_shard_t *shard) { ecache_postfork_parent(tsdn, &shard->pac.ecache_dirty); ecache_postfork_parent(tsdn, &shard->pac.ecache_muzzy); ecache_postfork_parent(tsdn, &shard->pac.ecache_retained); + ecache_postfork_parent(tsdn, &shard->pac.ecache_pinned); malloc_mutex_postfork_parent(tsdn, &shard->pac.grow_mtx); malloc_mutex_postfork_parent(tsdn, &shard->pac.decay_dirty.mtx); malloc_mutex_postfork_parent(tsdn, &shard->pac.decay_muzzy.mtx); @@ -64,6 +66,7 @@ pa_shard_postfork_child(tsdn_t *tsdn, pa_shard_t *shard) { ecache_postfork_child(tsdn, &shard->pac.ecache_dirty); ecache_postfork_child(tsdn, &shard->pac.ecache_muzzy); ecache_postfork_child(tsdn, &shard->pac.ecache_retained); + ecache_postfork_child(tsdn, &shard->pac.ecache_pinned); malloc_mutex_postfork_child(tsdn, &shard->pac.grow_mtx); malloc_mutex_postfork_child(tsdn, &shard->pac.decay_dirty.mtx); malloc_mutex_postfork_child(tsdn, &shard->pac.decay_muzzy.mtx); @@ -107,12 +110,15 @@ pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard, pa_shard_stats_out->pac_stats.retained += ecache_npages_get(&shard->pac.ecache_retained) << LG_PAGE; + pa_shard_stats_out->pac_stats.pinned += + ecache_npages_get(&shard->pac.ecache_pinned) << LG_PAGE; pa_shard_stats_out->edata_avail += atomic_load_zu( &shard->edata_cache.count, ATOMIC_RELAXED); size_t resident_pgs = 0; resident_pgs += pa_shard_nactive(shard); resident_pgs += pa_shard_ndirty(shard); + resident_pgs += ecache_npages_get(&shard->pac.ecache_pinned); *resident += (resident_pgs << LG_PAGE); /* Dirty decay stats */ @@ -147,22 +153,27 @@ pa_shard_stats_merge(tsdn_t *tsdn, pa_shard_t *shard, atomic_load_zu(&shard->pac.stats->abandoned_vm, ATOMIC_RELAXED)); for (pszind_t i = 0; i < SC_NPSIZES; i++) { - size_t dirty, muzzy, retained, dirty_bytes, muzzy_bytes, - retained_bytes; + size_t dirty, muzzy, retained, pinned, dirty_bytes, + muzzy_bytes, retained_bytes, pinned_bytes; dirty = ecache_nextents_get(&shard->pac.ecache_dirty, i); muzzy = ecache_nextents_get(&shard->pac.ecache_muzzy, i); retained = ecache_nextents_get(&shard->pac.ecache_retained, i); + pinned = ecache_nextents_get(&shard->pac.ecache_pinned, i); dirty_bytes = ecache_nbytes_get(&shard->pac.ecache_dirty, i); muzzy_bytes = 
ecache_nbytes_get(&shard->pac.ecache_muzzy, i); retained_bytes = ecache_nbytes_get( &shard->pac.ecache_retained, i); + pinned_bytes = ecache_nbytes_get( + &shard->pac.ecache_pinned, i); estats_out[i].ndirty = dirty; estats_out[i].nmuzzy = muzzy; estats_out[i].nretained = retained; + estats_out[i].npinned = pinned; estats_out[i].dirty_bytes = dirty_bytes; estats_out[i].muzzy_bytes = muzzy_bytes; estats_out[i].retained_bytes = retained_bytes; + estats_out[i].pinned_bytes = pinned_bytes; } if (shard->ever_used_hpa) { @@ -189,6 +200,8 @@ pa_shard_mtx_stats_read(tsdn_t *tsdn, pa_shard_t *shard, &shard->pac.ecache_muzzy.mtx, arena_prof_mutex_extents_muzzy); pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, &shard->pac.ecache_retained.mtx, arena_prof_mutex_extents_retained); + pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, + &shard->pac.ecache_pinned.mtx, arena_prof_mutex_extents_pinned); pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, &shard->pac.decay_dirty.mtx, arena_prof_mutex_decay_dirty); pa_shard_mtx_stats_read_single(tsdn, mutex_prof_data, diff --git a/src/pac.c b/src/pac.c index ed0f77c2..f589880d 100644 --- a/src/pac.c +++ b/src/pac.c @@ -31,6 +31,7 @@ pac_decay_data_get(pac_t *pac, extent_state_t state, decay_t **r_decay, return; case extent_state_active: case extent_state_retained: + case extent_state_pinned: case extent_state_transition: case extent_state_merging: default: @@ -72,6 +73,12 @@ pac_init(tsdn_t *tsdn, pac_t *pac, base_t *base, emap_t *emap, /* delay_coalesce */ false)) { return true; } + /* Pinned extents: no decay, delayed coalesce. */ + if (ecache_init(tsdn, &pac->ecache_pinned, extent_state_pinned, ind, + /* delay_coalesce */ true)) { + return true; + } + atomic_store_b(&pac->has_pinned, false, ATOMIC_RELAXED); exp_grow_init(&pac->exp_grow); if (malloc_mutex_init(&pac->grow_mtx, "extent_grow", WITNESS_RANK_EXTENT_GROW, malloc_mutex_rank_exclusive)) { @@ -110,6 +117,14 @@ pac_may_have_muzzy(pac_t *pac) { return pac_decay_ms_get(pac, extent_state_muzzy) != 0; } +static inline void +pac_ecache_dalloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, + edata_t *edata) { + ecache_dalloc(tsdn, pac, ehooks, + edata_pinned_get(edata) ? &pac->ecache_pinned : &pac->ecache_dirty, + edata); +} + static size_t pac_alloc_retained_batched_size(size_t size) { if (size > SC_LARGE_MAXCLASS) { @@ -133,8 +148,22 @@ pac_alloc_real(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, assert(!guarded || alignment <= PAGE); size_t newly_mapped_size = 0; - edata_t *edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_dirty, - NULL, size, alignment, zero, guarded); + edata_t *edata = NULL; + + /* + * Guarded allocations need surrounding guard pages, which the pinned + * pool does not maintain; skip ecache_pinned in that case. 
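+	 * The has_pinned flag and the page count below keep this check
+	 * cheap for arenas that have never seen pinned memory.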
+ */ + if (!guarded && atomic_load_b(&pac->has_pinned, ATOMIC_RELAXED) + && ecache_npages_get(&pac->ecache_pinned) > 0) { + edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_pinned, + NULL, size, alignment, zero, guarded); + } + + if (edata == NULL) { + edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_dirty, + NULL, size, alignment, zero, guarded); + } if (edata == NULL && pac_may_have_muzzy(pac)) { edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_muzzy, @@ -180,12 +209,10 @@ pac_alloc_real(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, edata, size, batched_size - size, /* holding_core_locks */ false); if (trail == NULL) { - ecache_dalloc(tsdn, pac, ehooks, - &pac->ecache_retained, edata); + pac_record_grown(tsdn, pac, ehooks, edata); edata = NULL; } else { - ecache_dalloc(tsdn, pac, ehooks, - &pac->ecache_dirty, trail); + pac_ecache_dalloc(tsdn, pac, ehooks, trail); } } @@ -277,23 +304,53 @@ pac_expand_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, if (ehooks_merge_will_fail(ehooks)) { return true; } - edata_t *trail = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_dirty, - edata, expand_amount, PAGE, zero, /* guarded*/ false); - if (trail == NULL) { - trail = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_muzzy, - edata, expand_amount, PAGE, zero, /* guarded*/ false); + edata_t *trail = NULL; + if (edata_pinned_get(edata)) { + trail = ecache_alloc(tsdn, pac, ehooks, + &pac->ecache_pinned, edata, expand_amount, + PAGE, zero, /* guarded */ false); + if (trail == NULL) { + /* + * Only ecache_pinned can hold a mergeable neighbor; + * dirty, muzzy, and retained extents are non-pinned. + * Pinned memory is already committed, and hooks are + * unlikely to reserve adjacent pinned space for growth, + * so don't consult the hook to grow in place. + */ + return true; + } + assert(edata_pinned_get(trail)); + } else { + trail = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_dirty, + edata, expand_amount, PAGE, zero, /* guarded */ false); + if (trail == NULL) { + trail = ecache_alloc(tsdn, pac, ehooks, + &pac->ecache_muzzy, edata, expand_amount, + PAGE, zero, /* guarded */ false); + } + if (trail == NULL) { + trail = ecache_alloc_grow(tsdn, pac, ehooks, + &pac->ecache_retained, edata, expand_amount, + PAGE, zero, /* guarded */ false); + mapped_add = expand_amount; + } + if (trail == NULL) { + return true; + } } - if (trail == NULL) { - trail = ecache_alloc_grow(tsdn, pac, ehooks, - &pac->ecache_retained, edata, expand_amount, PAGE, zero, - /* guarded */ false); - mapped_add = expand_amount; - } - if (trail == NULL) { - return true; - } - if (extent_merge_wrapper(tsdn, pac, ehooks, edata, trail)) { - extent_dalloc_wrapper(tsdn, pac, ehooks, trail); + /* extent_merge_wrapper requires matching pinnedness. 
*/ + if ((edata_pinned_get(edata) != edata_pinned_get(trail)) + || extent_merge_wrapper(tsdn, pac, ehooks, edata, trail)) { + if (edata_pinned_get(trail)) { + if (config_stats) { + atomic_fetch_add_zu(&pac->stats->pac_mapped, + mapped_add, ATOMIC_RELAXED); + } + ecache_dalloc(tsdn, pac, ehooks, + &pac->ecache_pinned, trail); + } else { + extent_dalloc_wrapper(tsdn, pac, ehooks, trail); + } return true; } if (config_stats && mapped_add > 0) { @@ -320,8 +377,11 @@ pac_shrink_impl(tsdn_t *tsdn, pai_t *self, edata_t *edata, size_t old_size, if (trail == NULL) { return true; } - ecache_dalloc(tsdn, pac, ehooks, &pac->ecache_dirty, trail); - *deferred_work_generated = true; + bool pinned = edata_pinned_get(trail); + pac_ecache_dalloc(tsdn, pac, ehooks, trail); + if (!pinned) { + *deferred_work_generated = true; + } return false; } @@ -352,9 +412,11 @@ pac_dalloc_impl( } } - ecache_dalloc(tsdn, pac, ehooks, &pac->ecache_dirty, edata); - /* Purging of deallocated pages is deferred */ - *deferred_work_generated = true; + bool pinned = edata_pinned_get(edata); + pac_ecache_dalloc(tsdn, pac, ehooks, edata); + if (!pinned) { + *deferred_work_generated = true; + } } static inline uint64_t @@ -543,6 +605,7 @@ pac_decay_stashed(tsdn_t *tsdn, pac_t *pac, decay_t *decay, break; case extent_state_active: case extent_state_retained: + case extent_state_pinned: case extent_state_transition: case extent_state_merging: default: @@ -721,7 +784,48 @@ pac_destroy(tsdn_t *tsdn, pac_t *pac) { * dss-based extents for later reuse. */ ehooks_t *ehooks = pac_ehooks_get(pac); - edata_t *edata; + edata_t *edata; + if (atomic_load_b(&pac->has_pinned, ATOMIC_RELAXED)) { + /* + * Reroute pinned extents through ecache_retained: clearing the + * pinned bit lets retained's eager coalesce merge fragments + * back to their original OS-allocation bases, so the destroy + * hook can release whole reservations (required on platforms + * like Windows where VirtualFree only accepts the original + * VirtualAlloc base). Subtract from pac_mapped along the way + * because retained is excluded from stats.mapped. 
+ */ + edata_list_inactive_t pinned_list; + edata_list_inactive_init(&pinned_list); + malloc_mutex_lock(tsdn, &pac->ecache_pinned.mtx); + assert(eset_npages_get(&pac->ecache_pinned.guarded_eset) == 0); + size_t pinned_bytes = + eset_npages_get(&pac->ecache_pinned.eset) << LG_PAGE; + while (eset_npages_get(&pac->ecache_pinned.eset) > 0) { + edata = eset_fit(&pac->ecache_pinned.eset, + PAGE, PAGE, /* exact_only */ false, SC_PTR_BITS, + /* prefer_small */ false); + assert(edata != NULL); + assert(edata_pinned_get(edata)); + eset_remove(&pac->ecache_pinned.eset, edata); + emap_update_edata_state(tsdn, pac->emap, edata, + extent_state_active); + edata_pinned_set(edata, false); + edata_list_inactive_append(&pinned_list, edata); + } + malloc_mutex_unlock(tsdn, &pac->ecache_pinned.mtx); + if (config_stats && pinned_bytes > 0) { + atomic_fetch_sub_zu(&pac->stats->pac_mapped, + pinned_bytes, ATOMIC_RELAXED); + } + while ((edata = edata_list_inactive_first(&pinned_list)) + != NULL) { + edata_list_inactive_remove(&pinned_list, edata); + extent_record(tsdn, pac, ehooks, + &pac->ecache_retained, edata); + } + } + assert(ecache_npages_get(&pac->ecache_pinned) == 0); while ( (edata = ecache_evict(tsdn, pac, ehooks, &pac->ecache_retained, 0)) != NULL) { diff --git a/src/stats.c b/src/stats.c index 82458fec..33198636 100644 --- a/src/stats.c +++ b/src/stats.c @@ -712,6 +712,8 @@ stats_arena_extents_print(emitter_t *emitter, unsigned i) { COL_HDR(row, muzzy, NULL, right, 13, size) COL_HDR(row, nretained, NULL, right, 13, size) COL_HDR(row, retained, NULL, right, 13, size) + COL_HDR(row, npinned, NULL, right, 13, size) + COL_HDR(row, pinned, NULL, right, 13, size) COL_HDR(row, ntotal, NULL, right, 13, size) COL_HDR(row, total, NULL, right, 13, size) @@ -728,22 +730,27 @@ stats_arena_extents_print(emitter_t *emitter, unsigned i) { in_gap = false; for (j = 0; j < SC_NPSIZES; j++) { - size_t ndirty, nmuzzy, nretained, total, dirty_bytes, - muzzy_bytes, retained_bytes, total_bytes; + size_t ndirty, nmuzzy, nretained, npinned, total, + dirty_bytes, muzzy_bytes, retained_bytes, pinned_bytes, + total_bytes; stats_arenas_mib[4] = j; CTL_LEAF(stats_arenas_mib, 5, "ndirty", &ndirty, size_t); CTL_LEAF(stats_arenas_mib, 5, "nmuzzy", &nmuzzy, size_t); CTL_LEAF(stats_arenas_mib, 5, "nretained", &nretained, size_t); + CTL_LEAF(stats_arenas_mib, 5, "npinned", &npinned, size_t); CTL_LEAF( stats_arenas_mib, 5, "dirty_bytes", &dirty_bytes, size_t); CTL_LEAF( stats_arenas_mib, 5, "muzzy_bytes", &muzzy_bytes, size_t); CTL_LEAF(stats_arenas_mib, 5, "retained_bytes", &retained_bytes, size_t); + CTL_LEAF(stats_arenas_mib, 5, "pinned_bytes", &pinned_bytes, + size_t); - total = ndirty + nmuzzy + nretained; - total_bytes = dirty_bytes + muzzy_bytes + retained_bytes; + total = ndirty + nmuzzy + nretained + npinned; + total_bytes = dirty_bytes + muzzy_bytes + retained_bytes + + pinned_bytes; in_gap_prev = in_gap; in_gap = (total == 0); @@ -758,6 +765,8 @@ stats_arena_extents_print(emitter_t *emitter, unsigned i) { emitter_json_kv(emitter, "nmuzzy", emitter_type_size, &nmuzzy); emitter_json_kv( emitter, "nretained", emitter_type_size, &nretained); + emitter_json_kv( + emitter, "npinned", emitter_type_size, &npinned); emitter_json_kv( emitter, "dirty_bytes", emitter_type_size, &dirty_bytes); @@ -765,6 +774,8 @@ stats_arena_extents_print(emitter_t *emitter, unsigned i) { emitter, "muzzy_bytes", emitter_type_size, &muzzy_bytes); emitter_json_kv(emitter, "retained_bytes", emitter_type_size, &retained_bytes); + emitter_json_kv(emitter, 
"pinned_bytes", emitter_type_size, + &pinned_bytes); emitter_json_object_end(emitter); col_size.size_val = sz_pind2sz(j); @@ -775,6 +786,8 @@ stats_arena_extents_print(emitter_t *emitter, unsigned i) { col_muzzy.size_val = muzzy_bytes; col_nretained.size_val = nretained; col_retained.size_val = retained_bytes; + col_npinned.size_val = npinned; + col_pinned.size_val = pinned_bytes; col_ntotal.size_val = total; col_total.size_val = total_bytes; @@ -1166,7 +1179,7 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, unsigned nthreads; const char *dss; ssize_t dirty_decay_ms, muzzy_decay_ms; - size_t page, pactive, pdirty, pmuzzy, mapped, retained; + size_t page, pactive, pdirty, pmuzzy, mapped, retained, pinned; size_t base, internal, resident, metadata_edata, metadata_rtree, metadata_thp, extent_avail; uint64_t dirty_npurge, dirty_nmadvise, dirty_purged; @@ -1467,6 +1480,7 @@ stats_arena_print(emitter_t *emitter, unsigned i, bool bins, bool large, GET_AND_EMIT_MEM_STAT(mapped) GET_AND_EMIT_MEM_STAT(retained) + GET_AND_EMIT_MEM_STAT(pinned) GET_AND_EMIT_MEM_STAT(base) GET_AND_EMIT_MEM_STAT(internal) GET_AND_EMIT_MEM_STAT(metadata_edata) @@ -1872,7 +1886,7 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, * the transition to the emitter code. */ size_t allocated, active, metadata, metadata_edata, metadata_rtree, - metadata_thp, resident, mapped, retained; + metadata_thp, resident, mapped, retained, pinned; size_t num_background_threads; size_t zero_reallocs; uint64_t background_thread_num_runs, background_thread_run_interval; @@ -1886,6 +1900,7 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, CTL_GET("stats.resident", &resident, size_t); CTL_GET("stats.mapped", &mapped, size_t); CTL_GET("stats.retained", &retained, size_t); + CTL_GET("stats.pinned", &pinned, size_t); CTL_GET("stats.zero_reallocs", &zero_reallocs, size_t); @@ -1916,15 +1931,16 @@ stats_print_helper(emitter_t *emitter, bool merged, bool destroyed, emitter_json_kv(emitter, "resident", emitter_type_size, &resident); emitter_json_kv(emitter, "mapped", emitter_type_size, &mapped); emitter_json_kv(emitter, "retained", emitter_type_size, &retained); + emitter_json_kv(emitter, "pinned", emitter_type_size, &pinned); emitter_json_kv( emitter, "zero_reallocs", emitter_type_size, &zero_reallocs); emitter_table_printf(emitter, "Allocated: %zu, active: %zu, " "metadata: %zu (n_thp %zu, edata %zu, rtree %zu), resident: %zu, " - "mapped: %zu, retained: %zu\n", + "mapped: %zu, retained: %zu, pinned: %zu\n", allocated, active, metadata, metadata_thp, metadata_edata, - metadata_rtree, resident, mapped, retained); + metadata_rtree, resident, mapped, retained, pinned); /* Strange behaviors */ emitter_table_printf(emitter, diff --git a/test/unit/extent_alloc_flags.c b/test/unit/extent_alloc_flags.c new file mode 100644 index 00000000..63c22c85 --- /dev/null +++ b/test/unit/extent_alloc_flags.c @@ -0,0 +1,498 @@ +#include "test/jemalloc_test.h" + +static void * +pinned_extent_alloc(extent_hooks_t *extent_hooks, void *new_addr, + size_t size, size_t alignment, bool *zero, bool *commit, + unsigned arena_ind) { + void *ret = ehooks_default_extent_hooks.alloc( + (extent_hooks_t *)&ehooks_default_extent_hooks, + new_addr, size, alignment, zero, commit, arena_ind); + if (ret == NULL) { + return NULL; + } + if (!*commit) { + if (ehooks_default_extent_hooks.commit != NULL && + ehooks_default_extent_hooks.commit( + (extent_hooks_t *)&ehooks_default_extent_hooks, ret, size, + 0, size, 
arena_ind)) { + ehooks_default_extent_hooks.dalloc( + (extent_hooks_t *)&ehooks_default_extent_hooks, ret, + size, *commit, arena_ind); + return NULL; + } + *commit = true; + } + return (void *)((uintptr_t)ret | EXTENT_ALLOC_FLAG_PINNED); +} + +static unsigned pinned_split_calls; +static unsigned pinned_destroy_calls; +static size_t pinned_destroy_bytes; + +static bool +pinned_extent_split(extent_hooks_t *extent_hooks, void *addr, size_t size, + size_t size_a, size_t size_b, bool committed, unsigned arena_ind) { + pinned_split_calls++; + return ehooks_default_extent_hooks.split( + (extent_hooks_t *)&ehooks_default_extent_hooks, addr, size, size_a, + size_b, committed, arena_ind); +} + +static bool +pinned_extent_merge(extent_hooks_t *extent_hooks, void *addr_a, size_t size_a, + void *addr_b, size_t size_b, bool committed, unsigned arena_ind) { + return ehooks_default_extent_hooks.merge( + (extent_hooks_t *)&ehooks_default_extent_hooks, addr_a, size_a, + addr_b, size_b, committed, arena_ind); +} + +static void +pinned_extent_destroy(extent_hooks_t *extent_hooks, void *addr, size_t size, + bool committed, unsigned arena_ind) { + pinned_destroy_calls++; + pinned_destroy_bytes += size; + ehooks_default_extent_hooks.destroy( + (extent_hooks_t *)&ehooks_default_extent_hooks, addr, size, + committed, arena_ind); +} + +static void +pinned_hooks_reset(void) { + pinned_split_calls = 0; + pinned_destroy_calls = 0; + pinned_destroy_bytes = 0; +} + +static extent_hooks_t pinned_hooks = { + pinned_extent_alloc, + NULL, /* dalloc — force retain */ + pinned_extent_destroy, + NULL, /* commit */ + NULL, /* decommit */ + NULL, /* purge_lazy */ + NULL, /* purge_forced */ + pinned_extent_split, + pinned_extent_merge +}; + +static size_t +get_arena_mapped(unsigned arena_ind) { + uint64_t epoch = 1; + size_t epoch_sz = sizeof(epoch); + expect_d_eq(0, mallctl("epoch", &epoch, &epoch_sz, &epoch, + sizeof(epoch)), "epoch failed"); + size_t mapped; + size_t mapped_sz = sizeof(mapped); + char buf[64]; + snprintf(buf, sizeof(buf), "stats.arenas.%u.mapped", arena_ind); + expect_d_eq(0, mallctl(buf, &mapped, &mapped_sz, NULL, 0), + "stats.arenas..mapped read failed"); + return mapped; +} + +/* + * Non-dependent emap lookup: returns the edata for addr, or NULL if the + * rtree leaf does not exist (safe for addresses that jemalloc may never + * have mapped, e.g. after arena destroy). + */ +static edata_t * +emap_edata_try_lookup(const void *ptr) { + emap_full_alloc_ctx_t ctx; + bool err = emap_full_alloc_ctx_try_lookup(TSDN_NULL, + &arena_emap_global, ptr, &ctx); + if (err) { + return NULL; + } + return ctx.edata; +} + +/* + * Find the edata covering addr by walking the emap from addr in PAGE + * strides. Returns NULL if no covering edata is found within max_bytes. + */ +static edata_t * +find_covering_edata(const void *addr, size_t max_bytes) { + uintptr_t a = (uintptr_t)addr; + edata_t *back = NULL; + for (size_t off = 0; off <= max_bytes; off += PAGE) { + back = emap_edata_try_lookup((void *)(a - off)); + if (back != NULL) { + break; + } + } + if (back == NULL) { + return NULL; + } + edata_t *fwd = NULL; + for (size_t off = 0; off <= max_bytes; off += PAGE) { + fwd = emap_edata_try_lookup((void *)(a + off)); + if (fwd != NULL) { + break; + } + } + return (back == fwd) ? back : NULL; +} + +TEST_BEGIN(test_pinned_stats) { + test_skip_if(!config_stats); + pinned_hooks_reset(); + + unsigned arena_ind; + size_t sz = sizeof(arena_ind); + extent_hooks_t *hooks_ptr = &pinned_hooks; + + /* Create arena with pinned hooks. 
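+	 * Their NULL dalloc forces retention, so cached extents can
+	 * only be released through the destroy hook, whose calls the
+	 * test counts.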
*/ + expect_d_eq(0, mallctl("arenas.create", &arena_ind, &sz, + &hooks_ptr, sizeof(hooks_ptr)), + "arena creation failed"); + + /* Allocate and free to populate ecache_pinned. */ + void *p = mallocx(PAGE * 4, MALLOCX_ARENA(arena_ind) + | MALLOCX_TCACHE_NONE); + expect_ptr_not_null(p, "alloc failed"); + dallocx(p, MALLOCX_TCACHE_NONE); + + /* Refresh stats. */ + uint64_t epoch = 1; + sz = sizeof(epoch); + expect_d_eq(0, mallctl("epoch", &epoch, &sz, &epoch, sizeof(epoch)), + "epoch failed"); + + /* Read total pinned stat. */ + char buf[128]; + size_t pinned_total; + sz = sizeof(pinned_total); + snprintf(buf, sizeof(buf), "stats.arenas.%u.pinned", arena_ind); + expect_d_eq(0, mallctl(buf, &pinned_total, &sz, NULL, 0), + "stats.arenas..pinned read failed"); + expect_zu_gt(pinned_total, 0, + "pinned total should be > 0 after free to pinned arena"); + + /* Pinned bytes are part of stats.mapped (unlike retained). */ + expect_zu_ge(get_arena_mapped(arena_ind), pinned_total, + "stats.mapped should include pinned bytes"); + + /* Destroy the arena. */ + snprintf(buf, sizeof(buf), "arena.%u.destroy", arena_ind); + expect_d_eq(0, mallctl(buf, NULL, NULL, NULL, 0), + "arena destroy failed"); +} +TEST_END + +TEST_BEGIN(test_pinned_shrink) { + test_skip_if(ehooks_default_split_impl()); + pinned_hooks_reset(); + + unsigned arena_ind; + size_t sz = sizeof(arena_ind); + extent_hooks_t *hooks_ptr = &pinned_hooks; + + /* Create arena with pinned hooks. */ + expect_d_eq(0, mallctl("arenas.create", &arena_ind, &sz, + &hooks_ptr, sizeof(hooks_ptr)), + "arena creation failed"); + + int flags = MALLOCX_ARENA(arena_ind) | MALLOCX_TCACHE_NONE; + void *ptr = mallocx(SC_LARGE_MINCLASS + PAGE, flags); + expect_ptr_not_null(ptr, "alloc failed"); + unsigned split_calls_before = pinned_split_calls; + void *shrunk = rallocx(ptr, SC_LARGE_MINCLASS, flags); + expect_ptr_not_null(shrunk, "shrink failed"); + expect_u_gt(pinned_split_calls, split_calls_before, + "shrink should invoke the split hook"); + dallocx(shrunk, MALLOCX_TCACHE_NONE); + + char buf[64]; + snprintf(buf, sizeof(buf), "arena.%u.destroy", arena_ind); + expect_d_eq(0, mallctl(buf, NULL, NULL, NULL, 0), + "arena destroy failed"); +} +TEST_END + +TEST_BEGIN(test_pinned_remnant_lock) { + test_skip_if(!opt_retain); + pinned_hooks_reset(); + unsigned arena_ind; + size_t sz = sizeof(arena_ind); + extent_hooks_t *hooks_ptr = &pinned_hooks; + + expect_d_eq(0, mallctl("arenas.create", &arena_ind, &sz, + &hooks_ptr, sizeof(hooks_ptr)), + "arena creation failed"); + + int flags = MALLOCX_ARENA(arena_ind) | MALLOCX_TCACHE_NONE; + + void *p1 = mallocx(SC_LARGE_MINCLASS, flags); + expect_ptr_not_null(p1, "first alloc failed"); + + tsd_t *tsd = tsd_fetch(); + arena_t *arena = arena_get(tsd_tsdn(tsd), arena_ind, false); + expect_ptr_not_null(arena, "arena_get failed"); + expect_zu_gt(ecache_npages_get(&arena->pa_shard.pac.ecache_pinned), + 0, "grow remnant should be cached in ecache_pinned"); + + dallocx(p1, MALLOCX_TCACHE_NONE); + + char buf[64]; + snprintf(buf, sizeof(buf), "arena.%u.destroy", arena_ind); + expect_d_eq(0, mallctl(buf, NULL, NULL, NULL, 0), + "arena destroy failed"); +} +TEST_END + +TEST_BEGIN(test_pinned_reuse) { + pinned_hooks_reset(); + unsigned arena_ind; + size_t sz = sizeof(arena_ind); + extent_hooks_t *hooks_ptr = &pinned_hooks; + + expect_d_eq(0, mallctl("arenas.create", &arena_ind, &sz, + &hooks_ptr, sizeof(hooks_ptr)), + "arena creation failed"); + + int flags = MALLOCX_ARENA(arena_ind) | MALLOCX_TCACHE_NONE; + + void *p1 = mallocx(PAGE * 2, flags); 
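+	/*
+	 * Freeing p1 should cache its extent in ecache_pinned rather than
+	 * feeding it into the dirty/muzzy decay pipeline; the next request
+	 * of the same size is then expected to be served from that cache,
+	 * at the same address, by the prefer-small fit policy.
+	 */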
+ expect_ptr_not_null(p1, "first alloc failed"); + dallocx(p1, MALLOCX_TCACHE_NONE); + + void *p2 = mallocx(PAGE * 2, flags); + expect_ptr_not_null(p2, "reuse alloc failed"); + expect_ptr_eq(p1, p2, + "pinned extent should be reused at the same address"); + dallocx(p2, MALLOCX_TCACHE_NONE); + + /* Destroy the arena. */ + char buf[64]; + snprintf(buf, sizeof(buf), "arena.%u.destroy", arena_ind); + expect_d_eq(0, mallctl(buf, NULL, NULL, NULL, 0), + "arena destroy failed"); +} +TEST_END + +TEST_BEGIN(test_pinned_realloc) { + test_skip_if(ehooks_default_split_impl()); + pinned_hooks_reset(); + unsigned arena_ind; + size_t sz = sizeof(arena_ind); + extent_hooks_t *hooks_ptr = &pinned_hooks; + + expect_d_eq(0, mallctl("arenas.create", &arena_ind, &sz, + &hooks_ptr, sizeof(hooks_ptr)), + "arena creation failed"); + + int flags = MALLOCX_ARENA(arena_ind) | MALLOCX_TCACHE_NONE; + + /* + * Reallocs within the large-class range shrink/grow in place; the + * pointer stays at the original address rather than moving to a + * fresh extent (important since pinned memory is finite). All sizes + * are multiples of SC_LARGE_MINCLASS so they bypass the slab path + * regardless of LG_PAGE / SC_LG_NGROUP. + */ + void *p = mallocx(SC_LARGE_MINCLASS * 7, flags); + expect_ptr_not_null(p, "initial alloc failed"); + void *original = p; + + p = rallocx(p, SC_LARGE_MINCLASS * 3, flags); + expect_ptr_not_null(p, "shrink failed"); + expect_ptr_eq(p, original, "shrink should preserve address"); + + p = rallocx(p, SC_LARGE_MINCLASS * 6, flags); + expect_ptr_not_null(p, "regrow failed"); + expect_ptr_eq(p, original, "regrow should preserve address"); + + dallocx(p, MALLOCX_TCACHE_NONE); + + char buf[64]; + snprintf(buf, sizeof(buf), "arena.%u.destroy", arena_ind); + expect_d_eq(0, mallctl(buf, NULL, NULL, NULL, 0), + "arena destroy failed"); +} +TEST_END + +TEST_BEGIN(test_pinned_reset) { + pinned_hooks_reset(); + unsigned arena_ind; + size_t sz = sizeof(arena_ind); + extent_hooks_t *hooks_ptr = &pinned_hooks; + + expect_d_eq(0, mallctl("arenas.create", &arena_ind, &sz, + &hooks_ptr, sizeof(hooks_ptr)), + "arena creation failed"); + + int flags = MALLOCX_ARENA(arena_ind) | MALLOCX_TCACHE_NONE; + + /* Allocate several pinned extents, leave them live. */ + void *ptrs[4]; + for (int i = 0; i < 4; i++) { + ptrs[i] = mallocx(SC_LARGE_MINCLASS * (i + 1), flags); + expect_ptr_not_null(ptrs[i], "alloc %d failed", i); + } + + /* + * arena..reset returns live allocations to the caches without + * destroying the arena. No destroy hook calls should fire (pinned + * extents stay in ecache_pinned), and the arena should still be + * usable for further allocations afterward. + */ + unsigned destroy_calls_before = pinned_destroy_calls; + char buf[64]; + snprintf(buf, sizeof(buf), "arena.%u.reset", arena_ind); + expect_d_eq(0, mallctl(buf, NULL, NULL, NULL, 0), + "arena reset failed"); + expect_u_eq(pinned_destroy_calls, destroy_calls_before, + "reset should not invoke the destroy hook"); + + tsd_t *tsd = tsd_fetch(); + arena_t *arena = arena_get(tsd_tsdn(tsd), arena_ind, false); + expect_ptr_not_null(arena, "arena_get failed"); + expect_zu_gt(ecache_npages_get(&arena->pa_shard.pac.ecache_pinned), 0, + "pinned ecache should hold the reset extents"); + + /* Arena is still usable: alloc and free again. 
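+	 * Reset tears down live allocations but keeps the arena itself and
+	 * its pinned cache intact, so this allocation should be satisfiable
+	 * straight from ecache_pinned.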
*/ + void *p = mallocx(SC_LARGE_MINCLASS, flags); + expect_ptr_not_null(p, "post-reset alloc failed"); + dallocx(p, MALLOCX_TCACHE_NONE); + + snprintf(buf, sizeof(buf), "arena.%u.destroy", arena_ind); + expect_d_eq(0, mallctl(buf, NULL, NULL, NULL, 0), + "arena destroy failed"); +} +TEST_END + +TEST_BEGIN(test_pinned_destroy) { + pinned_hooks_reset(); + + unsigned arena_ind; + size_t sz = sizeof(arena_ind); + extent_hooks_t *hooks_ptr = &pinned_hooks; + expect_d_eq(0, mallctl("arenas.create", &arena_ind, &sz, + &hooks_ptr, sizeof(hooks_ptr)), + "arena creation failed"); + + int flags = MALLOCX_ARENA(arena_ind) | MALLOCX_TCACHE_NONE; + + size_t mapped_initial = config_stats ? get_arena_mapped(arena_ind) : 0; + + void *ptrs[4]; + size_t sizes[4] = {PAGE * 4, SC_LARGE_MINCLASS, + SC_LARGE_MINCLASS + PAGE, SC_LARGE_MINCLASS * 2}; + size_t total = 0; + for (int i = 0; i < 4; i++) { + ptrs[i] = mallocx(sizes[i], flags); + expect_ptr_not_null(ptrs[i], "alloc %d failed", i); + total += sizes[i]; + } + + if (config_stats) { + expect_zu_ge(get_arena_mapped(arena_ind) - mapped_initial, + total, + "mapped should grow by at least total after pinned allocs"); + } + + /* + * Stress alloc/dalloc/realloc churn before the final teardown. Mix + * sizes (some at the slab boundary, some multi-large-class), and on + * platforms that support splitting, alternate shrink and regrow on + * the same extent to stress the in-place realloc accounting paths. + */ + bool can_split = !ehooks_default_split_impl(); + for (int round = 0; round < 64; round++) { + size_t s = SC_LARGE_MINCLASS + PAGE * (round % 4); + void *p = mallocx(s, flags); + expect_ptr_not_null(p, "churn alloc %d failed", round); + if (can_split && (round & 1)) { + void *shrunk = rallocx(p, SC_LARGE_MINCLASS, flags); + expect_ptr_not_null(shrunk, "churn shrink %d failed", + round); + void *regrown = rallocx(shrunk, SC_LARGE_MINCLASS * 3, + flags); + expect_ptr_not_null(regrown, "churn regrow %d failed", + round); + dallocx(regrown, MALLOCX_TCACHE_NONE); + } else { + void *p2 = rallocx(p, SC_LARGE_MINCLASS * 2, flags); + expect_ptr_not_null(p2, "churn realloc %d failed", + round); + dallocx(p2, MALLOCX_TCACHE_NONE); + } + } + /* Free and re-alloc one of the original pointers to exercise reuse. */ + dallocx(ptrs[1], MALLOCX_TCACHE_NONE); + ptrs[1] = mallocx(sizes[1], flags); + expect_ptr_not_null(ptrs[1], "reuse alloc failed"); + + for (int i = 0; i < 4; i++) { + dallocx(ptrs[i], MALLOCX_TCACHE_NONE); + } + + if (config_stats) { + expect_zu_ge(get_arena_mapped(arena_ind) - mapped_initial, + total, + "pinned bytes should remain in mapped after dalloc " + "(no decay for pinned)"); + } + + tsd_t *tsd = tsd_fetch(); + tsdn_t *tsdn = tsd_tsdn(tsd); + arena_t *arena = arena_get(tsdn, arena_ind, false); + expect_ptr_not_null(arena, "arena_get failed"); + size_t pinned_bytes = + ecache_npages_get(&arena->pa_shard.pac.ecache_pinned) << LG_PAGE; + expect_zu_gt(pinned_bytes, 0, + "pinned ecache should contain the freed extents"); + + /* + * Pinned extents stay registered in the emap after dalloc. Coalescing + * may have made any individual ptrs[i] interior to a larger merged + * extent, so use find_covering_edata to walk to the merged extent's + * base/last-page; each must resolve to a pinned-state edata. The + * search bound is the total bytes currently in ecache_pinned, which + * upper-bounds the size of any covering merged extent. 
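+	 * For example, if ptrs[1] and ptrs[2] coalesced into one extent,
+	 * the backward walk finds the merged extent's registered head and
+	 * the forward walk its registered tail; both resolve to the same
+	 * edata, so the back == fwd check in find_covering_edata accepts it.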
+ */ + for (int i = 0; i < 4; i++) { + edata_t *covering = find_covering_edata(ptrs[i], + pinned_bytes); + expect_ptr_not_null(covering, + "freed pinned extent ptrs[%d] should still be reachable " + "in the emap", i); + expect_d_eq(edata_state_get(covering), extent_state_pinned, + "covering extent for ptrs[%d] should be in pinned state", + i); + } + + char buf[64]; + snprintf(buf, sizeof(buf), "arena.%u.destroy", arena_ind); + unsigned destroy_calls_before = pinned_destroy_calls; + size_t destroy_bytes_before = pinned_destroy_bytes; + expect_d_eq(0, mallctl(buf, NULL, NULL, NULL, 0), + "arena destroy failed"); + expect_u_gt(pinned_destroy_calls, destroy_calls_before, + "arena destroy should invoke the destroy hook"); + expect_zu_ge(pinned_destroy_bytes - destroy_bytes_before, total, + "destroy hook should be called with at least the allocated total " + "(coalesced fragments returned to OS)"); + if (maps_coalesce) { + expect_u_lt(pinned_destroy_calls - destroy_calls_before, 4, + "destroy calls should be < #allocs after coalesce"); + } + for (int i = 0; i < 4; i++) { + expect_ptr_null(find_covering_edata(ptrs[i], pinned_bytes), + "arena destroy should clear the emap entry for ptrs[%d]", + i); + } +} +TEST_END + +int +main(void) { + return test_no_reentrancy( + test_pinned_remnant_lock, + test_pinned_reuse, + test_pinned_realloc, + test_pinned_stats, + test_pinned_shrink, + test_pinned_reset, + test_pinned_destroy); +}
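+
+/*
+ * Illustrative sketch only (not used by the tests above): a production
+ * pinned alloc hook backed by Linux hugetlb pages might look roughly like
+ * the following. hugetlb_alloc is a hypothetical name, and alignment and
+ * new_addr handling are elided:
+ *
+ *	static void *
+ *	hugetlb_alloc(extent_hooks_t *hooks, void *new_addr, size_t size,
+ *	    size_t alignment, bool *zero, bool *commit, unsigned arena_ind) {
+ *		void *p = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ *		    MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
+ *		if (p == MAP_FAILED) {
+ *			return NULL;
+ *		}
+ *		*zero = true;	/* anonymous mmap is zero-filled */
+ *		*commit = true;	/* pinned returns must be committed */
+ *		return (void *)((uintptr_t)p | EXTENT_ALLOC_FLAG_PINNED);
+ *	}
+ */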