diff --git a/include/jemalloc/internal/sz.h b/include/jemalloc/internal/sz.h index 56c63123..7eef7bce 100644 --- a/include/jemalloc/internal/sz.h +++ b/include/jemalloc/internal/sz.h @@ -310,6 +310,17 @@ sz_size2index_usize_fastpath(size_t size, szind_t *ind, size_t *usize) { } } +JEMALLOC_ALWAYS_INLINE size_t +sz_s2u_size_class_compute(size_t size) { + size_t x = lg_floor((size<<1)-1); + size_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1) + ? LG_QUANTUM : x - SC_LG_NGROUP - 1; + size_t delta = ZU(1) << lg_delta; + size_t delta_mask = delta - 1; + size_t usize = (size + delta_mask) & ~delta_mask; + return usize; +} + JEMALLOC_ALWAYS_INLINE size_t sz_s2u_compute(size_t size) { if (unlikely(size > SC_LARGE_MAXCLASS)) { @@ -328,13 +339,7 @@ sz_s2u_compute(size_t size) { } #endif if (!sz_limit_usize_gap_enabled() || size <= SC_SMALL_MAXCLASS) { - size_t x = lg_floor((size<<1)-1); - size_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1) - ? LG_QUANTUM : x - SC_LG_NGROUP - 1; - size_t delta = ZU(1) << lg_delta; - size_t delta_mask = delta - 1; - size_t usize = (size + delta_mask) & ~delta_mask; - return usize; + return sz_s2u_size_class_compute(size); } else { size_t usize = ((size + PAGE - 1) >> LG_PAGE) << LG_PAGE; assert(usize - size < PAGE); diff --git a/src/pac.c b/src/pac.c index 57a0c953..28cf2c39 100644 --- a/src/pac.c +++ b/src/pac.c @@ -112,10 +112,24 @@ pac_may_have_muzzy(pac_t *pac) { return pac_decay_ms_get(pac, extent_state_muzzy) != 0; } +size_t pac_batched_alloc_retained_size(size_t size) { + if (size > SC_LARGE_MAXCLASS) { + return size; + } + size_t batched_size = sz_s2u_size_class_compute(size); + size_t capped_size = ((size + HUGEPAGE - 1) >> LG_HUGEPAGE) + << LG_HUGEPAGE; + if (batched_size >= capped_size) { + batched_size = capped_size; + } + return batched_size; +} + static edata_t * pac_alloc_real(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, size_t alignment, bool zero, bool guarded) { assert(!guarded || alignment <= PAGE); + size_t 
newly_mapped_size = 0; edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_dirty, NULL, size, alignment, zero, guarded); @@ -124,14 +138,64 @@ pac_alloc_real(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, size_t size, edata = ecache_alloc(tsdn, pac, ehooks, &pac->ecache_muzzy, NULL, size, alignment, zero, guarded); } + + /* + * We batched allocate a larger extent when limit_usize_gap is enabled + * because the reuse of extents in the dirty pool is worse without size + * classes for large allocs. For instance, when limit_usize_gap is not + * enabled, 1.1MB, 1.15MB, and 1.2MB allocs will all be ceiled to + * 1.25MB and can reuse the same buffer if they are alloc & dalloc + * sequentially. However, with limit_usize_gap enabled, they cannot + * reuse the same buffer and their sequential allocs & dallocs will + * result in three different extents. Thus, we cache extra mergeable + * extents in the dirty pool to improve the reuse. We skip this + * optimization if both maps_coalesce and opt_retain are disabled + * because VM is not cheap enough to be used aggressively and extents + * cannot be merged at will (only extents from the same VirtualAlloc + * can be merged). Note that it could still be risky to cache more + * extents when either maps_coalesce or opt_retain is enabled. Yet + * doing so is still beneficial in improving the reuse of extents + * with some limits. This choice should be reevaluated if + * pac_batched_alloc_retained_size is changed to be more aggressive. 
+ */ + if (sz_limit_usize_gap_enabled() && edata == NULL && + (maps_coalesce || opt_retain)) { + size_t batched_size = pac_batched_alloc_retained_size( + size); + edata = ecache_alloc_grow(tsdn, pac, ehooks, + &pac->ecache_retained, NULL, batched_size, + alignment, zero, guarded); + + if (edata != NULL && batched_size > size) { + edata_t *trail = extent_split_wrapper(tsdn, pac, + ehooks, edata, size, batched_size - size, + /* holding_core_locks */ false); + if (trail == NULL) { + ecache_dalloc(tsdn, pac, ehooks, + &pac->ecache_retained, edata); + edata = NULL; + } else { + ecache_dalloc(tsdn, pac, ehooks, + &pac->ecache_dirty, trail); + + } + } + + if (edata != NULL) { + newly_mapped_size = batched_size; + } + } + if (edata == NULL) { edata = ecache_alloc_grow(tsdn, pac, ehooks, &pac->ecache_retained, NULL, size, alignment, zero, guarded); - if (config_stats && edata != NULL) { - atomic_fetch_add_zu(&pac->stats->pac_mapped, size, - ATOMIC_RELAXED); - } + newly_mapped_size = (edata != NULL) ? size : 0; + } + + if (config_stats && newly_mapped_size != 0) { + atomic_fetch_add_zu(&pac->stats->pac_mapped, + newly_mapped_size, ATOMIC_RELAXED); } return edata; diff --git a/test/unit/arena_decay.c b/test/unit/arena_decay.c index 10d1a6b1..00a38326 100644 --- a/test/unit/arena_decay.c +++ b/test/unit/arena_decay.c @@ -410,7 +410,14 @@ TEST_BEGIN(test_decay_never) { /* Verify that each deallocation generates additional dirty pages. */ size_t pdirty_prev = get_arena_pdirty(arena_ind); size_t pmuzzy_prev = get_arena_pmuzzy(arena_ind); - expect_zu_eq(pdirty_prev, 0, "Unexpected dirty pages"); + /* + * With limit_usize_gap enabled, some more extents + * are cached in the dirty pool, making the assumption below + * not true. + */ + if (!sz_limit_usize_gap_enabled()) { + expect_zu_eq(pdirty_prev, 0, "Unexpected dirty pages"); + } expect_zu_eq(pmuzzy_prev, 0, "Unexpected muzzy pages"); for (unsigned i = 0; i < sizeof(sizes)/sizeof(size_t); i++) { dallocx(ptrs[i], flags);