mirror of
https://github.com/jemalloc/jemalloc.git
synced 2026-04-14 14:41:42 +03:00
Introducing a new usize calculation policy
Converting size to usize has been done in jemalloc by ceiling size to the closest size class. However, this causes lots of memory waste with HPA enabled. This commit changes how usize is calculated so that the gap between two contiguous usize is no larger than a page. Specifically, this commit includes the following changes: 1. Adding a build-time config option (--enable-limit-usize-gap) and a runtime one (limit_usize_gap) to guard the changes. When build-time config is enabled, some minor CPU overhead is expected because usize will be stored and accessed apart from index. When the runtime option is also enabled (it can only be enabled with the build-time config enabled), a new usize calculation approach will be employed. This new calculation will ceil size to the closest multiple of PAGE for all sizes larger than USIZE_GROW_SLOW_THRESHOLD instead of using the size classes. Note when the build-time config is enabled, the runtime option is on by default. 2. Prepare tcache for size to grow by PAGE over GROUP*PAGE. To prepare for the upcoming changes where size class grows by PAGE when larger than NGROUP * PAGE, disable the tcache when it is larger than 2 * NGROUP * PAGE. The threshold for tcache is set higher to prevent perf regression as much as possible while usizes between NGROUP * PAGE and 2 * NGROUP * PAGE happen to grow by PAGE. 3. Prepare pac and hpa psset for size to grow by PAGE over GROUP*PAGE. For PAC, to avoid having too many bins, arena bins still have the same layout. This means some extra search is needed for a page-level request that is not aligned with the original size class: it should also search the heap before the current index since the previous heap might also be able to have some allocations satisfying it. The same changes apply to HPA's psset. This search relies on the enumeration of the heap because not all allocs in the previous heap are guaranteed to satisfy the request. 
To balance the memory and CPU overhead, we currently enumerate at most a fixed number of nodes before concluding none can satisfy the request during an enumeration. 4. Add bytes counter to arena large stats. To prepare for the upcoming usize changes, stats collected by multiplying alive allocations and the bin size are no longer accurate. Thus, add separate counters to record the bytes malloced and dalloced. 5. Change structs used when freeing to avoid using index2size for large sizes. - Change the definition of emap_alloc_ctx_t - Change the read of both from edata_t. - Change the assignment and usage of emap_alloc_ctx_t. - Change other callsites of index2size. Note that the changes in the data structure, i.e., emap_alloc_ctx_t, will be used when the build-time config (--enable-limit-usize-gap) is enabled but they will store the same value as index2size(szind) if the runtime option (opt_limit_usize_gap) is not enabled. 6. Adapt hpa to the usize changes. Change the settings in sec to limit its usage for sizes larger than USIZE_GROW_SLOW_THRESHOLD and modify corresponding tests. 7. Modify usize calculation and corresponding tests. Change the sz_s2u_compute. Note sz_index2size is not always safe now while sz_size2index still works as expected.
This commit is contained in:
parent
ac279d7e71
commit
c067a55c79
33 changed files with 713 additions and 74 deletions
|
|
@ -51,7 +51,7 @@ arena_choose_maybe_huge(tsd_t *tsd, arena_t *arena, size_t size) {
|
|||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE bool
|
||||
large_dalloc_safety_checks(edata_t *edata, const void *ptr, szind_t szind) {
|
||||
large_dalloc_safety_checks(edata_t *edata, const void *ptr, size_t input_size) {
|
||||
if (!config_opt_safety_checks) {
|
||||
return false;
|
||||
}
|
||||
|
|
@ -68,7 +68,6 @@ large_dalloc_safety_checks(edata_t *edata, const void *ptr, szind_t szind) {
|
|||
"possibly caused by double free bugs.", ptr);
|
||||
return true;
|
||||
}
|
||||
size_t input_size = sz_index2size(szind);
|
||||
if (unlikely(input_size != edata_usize_get(edata))) {
|
||||
safety_check_fail_sized_dealloc(/* current_dealloc */ true, ptr,
|
||||
/* true_size */ edata_usize_get(edata), input_size);
|
||||
|
|
@ -101,9 +100,10 @@ arena_prof_info_get(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx,
|
|||
if (unlikely(!is_slab)) {
|
||||
/* edata must have been initialized at this point. */
|
||||
assert(edata != NULL);
|
||||
size_t usize = (alloc_ctx == NULL)? edata_usize_get(edata):
|
||||
emap_alloc_ctx_usize_get(alloc_ctx);
|
||||
if (reset_recent &&
|
||||
large_dalloc_safety_checks(edata, ptr,
|
||||
edata_szind_get(edata))) {
|
||||
large_dalloc_safety_checks(edata, ptr, usize)) {
|
||||
prof_info->alloc_tctx = PROF_TCTX_SENTINEL;
|
||||
return;
|
||||
}
|
||||
|
|
@ -225,7 +225,7 @@ arena_salloc(tsdn_t *tsdn, const void *ptr) {
|
|||
emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx);
|
||||
assert(alloc_ctx.szind != SC_NSIZES);
|
||||
|
||||
return sz_index2size(alloc_ctx.szind);
|
||||
return emap_alloc_ctx_usize_get(&alloc_ctx);
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE size_t
|
||||
|
|
@ -256,17 +256,24 @@ arena_vsalloc(tsdn_t *tsdn, const void *ptr) {
|
|||
|
||||
assert(full_alloc_ctx.szind != SC_NSIZES);
|
||||
|
||||
return sz_index2size(full_alloc_ctx.szind);
|
||||
return edata_usize_get(full_alloc_ctx.edata);
|
||||
}
|
||||
|
||||
static inline void
|
||||
arena_dalloc_large_no_tcache(tsdn_t *tsdn, void *ptr, szind_t szind) {
|
||||
arena_dalloc_large_no_tcache(tsdn_t *tsdn, void *ptr, szind_t szind,
|
||||
size_t usize) {
|
||||
/*
|
||||
 * szind is still needed in this function mainly because
|
||||
* szind < SC_NBINS determines not only if this is a small alloc,
|
||||
* but also if szind is valid (an inactive extent would have
|
||||
* szind == SC_NSIZES).
|
||||
*/
|
||||
if (config_prof && unlikely(szind < SC_NBINS)) {
|
||||
arena_dalloc_promoted(tsdn, ptr, NULL, true);
|
||||
} else {
|
||||
edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
|
||||
ptr);
|
||||
if (large_dalloc_safety_checks(edata, ptr, szind)) {
|
||||
if (large_dalloc_safety_checks(edata, ptr, usize)) {
|
||||
/* See the comment in isfree. */
|
||||
return;
|
||||
}
|
||||
|
|
@ -287,19 +294,22 @@ arena_dalloc_no_tcache(tsdn_t *tsdn, void *ptr) {
|
|||
assert(alloc_ctx.szind == edata_szind_get(edata));
|
||||
assert(alloc_ctx.szind < SC_NSIZES);
|
||||
assert(alloc_ctx.slab == edata_slab_get(edata));
|
||||
assert(emap_alloc_ctx_usize_get(&alloc_ctx) ==
|
||||
edata_usize_get(edata));
|
||||
}
|
||||
|
||||
if (likely(alloc_ctx.slab)) {
|
||||
/* Small allocation. */
|
||||
arena_dalloc_small(tsdn, ptr);
|
||||
} else {
|
||||
arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind);
|
||||
arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind,
|
||||
emap_alloc_ctx_usize_get(&alloc_ctx));
|
||||
}
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
arena_dalloc_large(tsdn_t *tsdn, void *ptr, tcache_t *tcache, szind_t szind,
|
||||
bool slow_path) {
|
||||
size_t usize, bool slow_path) {
|
||||
assert (!tsdn_null(tsdn) && tcache != NULL);
|
||||
bool is_sample_promoted = config_prof && szind < SC_NBINS;
|
||||
if (unlikely(is_sample_promoted)) {
|
||||
|
|
@ -313,7 +323,7 @@ arena_dalloc_large(tsdn_t *tsdn, void *ptr, tcache_t *tcache, szind_t szind,
|
|||
} else {
|
||||
edata_t *edata = emap_edata_lookup(tsdn,
|
||||
&arena_emap_global, ptr);
|
||||
if (large_dalloc_safety_checks(edata, ptr, szind)) {
|
||||
if (large_dalloc_safety_checks(edata, ptr, usize)) {
|
||||
/* See the comment in isfree. */
|
||||
return;
|
||||
}
|
||||
|
|
@ -396,6 +406,8 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
|
|||
assert(alloc_ctx.szind == edata_szind_get(edata));
|
||||
assert(alloc_ctx.szind < SC_NSIZES);
|
||||
assert(alloc_ctx.slab == edata_slab_get(edata));
|
||||
assert(emap_alloc_ctx_usize_get(&alloc_ctx) ==
|
||||
edata_usize_get(edata));
|
||||
}
|
||||
|
||||
if (likely(alloc_ctx.slab)) {
|
||||
|
|
@ -407,7 +419,7 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
|
|||
alloc_ctx.szind, slow_path);
|
||||
} else {
|
||||
arena_dalloc_large(tsdn, ptr, tcache, alloc_ctx.szind,
|
||||
slow_path);
|
||||
emap_alloc_ctx_usize_get(&alloc_ctx), slow_path);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -422,8 +434,9 @@ arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) {
|
|||
* There is no risk of being confused by a promoted sampled
|
||||
* object, so base szind and slab on the given size.
|
||||
*/
|
||||
alloc_ctx.szind = sz_size2index(size);
|
||||
alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS);
|
||||
szind_t szind = sz_size2index(size);
|
||||
emap_alloc_ctx_init(&alloc_ctx, szind, (szind < SC_NBINS),
|
||||
size);
|
||||
}
|
||||
|
||||
if ((config_prof && opt_prof) || config_debug) {
|
||||
|
|
@ -446,7 +459,8 @@ arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) {
|
|||
/* Small allocation. */
|
||||
arena_dalloc_small(tsdn, ptr);
|
||||
} else {
|
||||
arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind);
|
||||
arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind,
|
||||
emap_alloc_ctx_usize_get(&alloc_ctx));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -469,6 +483,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
|
|||
emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr,
|
||||
&alloc_ctx);
|
||||
assert(alloc_ctx.szind == sz_size2index(size));
|
||||
assert(emap_alloc_ctx_usize_get(&alloc_ctx) == size);
|
||||
} else {
|
||||
alloc_ctx = *caller_alloc_ctx;
|
||||
}
|
||||
|
|
@ -486,6 +501,11 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
|
|||
ptr);
|
||||
assert(alloc_ctx.szind == edata_szind_get(edata));
|
||||
assert(alloc_ctx.slab == edata_slab_get(edata));
|
||||
emap_alloc_ctx_init(&alloc_ctx, alloc_ctx.szind, alloc_ctx.slab,
|
||||
sz_s2u(size));
|
||||
assert(!config_limit_usize_gap ||
|
||||
emap_alloc_ctx_usize_get(&alloc_ctx) ==
|
||||
edata_usize_get(edata));
|
||||
}
|
||||
|
||||
if (likely(alloc_ctx.slab)) {
|
||||
|
|
@ -497,7 +517,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
|
|||
alloc_ctx.szind, slow_path);
|
||||
} else {
|
||||
arena_dalloc_large(tsdn, ptr, tcache, alloc_ctx.szind,
|
||||
slow_path);
|
||||
sz_s2u(size), slow_path);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -14,12 +14,18 @@ JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
|
|||
typedef struct arena_stats_large_s arena_stats_large_t;
|
||||
struct arena_stats_large_s {
|
||||
/*
|
||||
* Total number of allocation/deallocation requests served directly by
|
||||
* the arena.
|
||||
* Total number of large allocation/deallocation requests served directly
|
||||
* by the arena.
|
||||
*/
|
||||
locked_u64_t nmalloc;
|
||||
locked_u64_t ndalloc;
|
||||
|
||||
/*
|
||||
* Total large active bytes (allocated - deallocated) served directly
|
||||
* by the arena.
|
||||
*/
|
||||
locked_u64_t active_bytes;
|
||||
|
||||
/*
|
||||
* Number of allocation requests that correspond to this size class.
|
||||
* This includes requests served by tcache, though tcache only
|
||||
|
|
|
|||
|
|
@ -21,6 +21,14 @@
|
|||
*/
|
||||
#define EDATA_ALIGNMENT 128
|
||||
|
||||
/*
|
||||
* Defines how many nodes visited when enumerating the heap to search for
|
||||
 * qualified extents. More nodes visited may result in better choices at
|
||||
* the cost of longer search time. This size should not exceed 2^16 - 1
|
||||
* because we use uint16_t for accessing the queue needed for enumeration.
|
||||
*/
|
||||
#define ESET_ENUMERATE_MAX_NUM 32
|
||||
|
||||
enum extent_state_e {
|
||||
extent_state_active = 0,
|
||||
extent_state_dirty = 1,
|
||||
|
|
@ -89,8 +97,8 @@ struct edata_cmp_summary_s {
|
|||
|
||||
/* Extent (span of pages). Use accessor functions for e_* fields. */
|
||||
typedef struct edata_s edata_t;
|
||||
ph_structs(edata_avail, edata_t);
|
||||
ph_structs(edata_heap, edata_t);
|
||||
ph_structs(edata_avail, edata_t, ESET_ENUMERATE_MAX_NUM);
|
||||
ph_structs(edata_heap, edata_t, ESET_ENUMERATE_MAX_NUM);
|
||||
struct edata_s {
|
||||
/*
|
||||
* Bitfield containing several fields:
|
||||
|
|
@ -281,7 +289,54 @@ edata_szind_get(const edata_t *edata) {
|
|||
|
||||
static inline size_t
|
||||
edata_usize_get(const edata_t *edata) {
|
||||
return sz_index2size(edata_szind_get(edata));
|
||||
assert(edata != NULL);
|
||||
/*
|
||||
* When sz_limit_usize_gap_enabled() is true, two cases:
|
||||
* 1. if usize_from_ind is not smaller than SC_LARGE_MINCLASS,
|
||||
* usize_from_size is accurate;
|
||||
* 2. otherwise, usize_from_ind is accurate.
|
||||
*
|
||||
* When sz_limit_usize_gap_enabled() is not true, the two should be the
|
||||
* same when usize_from_ind is not smaller than SC_LARGE_MINCLASS.
|
||||
*
|
||||
* Note sampled small allocs will be promoted. Their extent size is
|
||||
* recorded in edata_size_get(edata), while their szind reflects the
|
||||
* true usize. Thus, usize retrieved here is still accurate for
|
||||
* sampled small allocs.
|
||||
*/
|
||||
szind_t szind = edata_szind_get(edata);
|
||||
#ifdef JEMALLOC_JET
|
||||
/*
|
||||
* Double free is invalid and results in undefined behavior. However,
|
||||
* for double free tests to end gracefully, return an invalid usize
|
||||
* when szind shows the edata is not active, i.e., szind == SC_NSIZES.
|
||||
*/
|
||||
if (unlikely(szind == SC_NSIZES)) {
|
||||
return SC_LARGE_MAXCLASS + 1;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!sz_limit_usize_gap_enabled() || szind < SC_NBINS) {
|
||||
size_t usize_from_ind = sz_index2size(szind);
|
||||
if (!sz_limit_usize_gap_enabled() &&
|
||||
usize_from_ind >= SC_LARGE_MINCLASS) {
|
||||
size_t size = (edata->e_size_esn & EDATA_SIZE_MASK);
|
||||
assert(size > sz_large_pad);
|
||||
size_t usize_from_size = size - sz_large_pad;
|
||||
assert(usize_from_ind == usize_from_size);
|
||||
}
|
||||
return usize_from_ind;
|
||||
}
|
||||
|
||||
size_t size = (edata->e_size_esn & EDATA_SIZE_MASK);
|
||||
assert(size > sz_large_pad);
|
||||
size_t usize_from_size = size - sz_large_pad;
|
||||
/*
|
||||
 * No matter whether limit-usize-gap is enabled, the usize retrieved from size
|
||||
* is not accurate when smaller than SC_LARGE_MINCLASS.
|
||||
*/
|
||||
assert(usize_from_size >= SC_LARGE_MINCLASS);
|
||||
return usize_from_size;
|
||||
}
|
||||
|
||||
static inline unsigned
|
||||
|
|
|
|||
|
|
@ -20,8 +20,9 @@ struct emap_s {
|
|||
};
|
||||
|
||||
/* Used to pass rtree lookup context down the path. */
|
||||
typedef struct emap_alloc_ctx_t emap_alloc_ctx_t;
|
||||
struct emap_alloc_ctx_t {
|
||||
typedef struct emap_alloc_ctx_s emap_alloc_ctx_t;
|
||||
struct emap_alloc_ctx_s {
|
||||
size_t usize;
|
||||
szind_t szind;
|
||||
bool slab;
|
||||
};
|
||||
|
|
@ -230,16 +231,66 @@ emap_edata_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr) {
|
|||
return rtree_read(tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr).edata;
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
emap_alloc_ctx_init(emap_alloc_ctx_t *alloc_ctx, szind_t szind, bool slab,
|
||||
size_t usize) {
|
||||
alloc_ctx->szind = szind;
|
||||
alloc_ctx->slab = slab;
|
||||
/*
|
||||
* When config_limit_usize_gap disabled, alloc_ctx->usize
|
||||
* should not be accessed.
|
||||
*/
|
||||
if (config_limit_usize_gap) {
|
||||
alloc_ctx->usize = usize;
|
||||
assert(sz_limit_usize_gap_enabled() ||
|
||||
usize == sz_index2size(szind));
|
||||
} else if (config_debug) {
|
||||
alloc_ctx->usize = SC_LARGE_MAXCLASS + 1;
|
||||
}
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE size_t
|
||||
emap_alloc_ctx_usize_get(emap_alloc_ctx_t *alloc_ctx) {
|
||||
assert(alloc_ctx->szind < SC_NSIZES);
|
||||
if (!config_limit_usize_gap || alloc_ctx->slab) {
|
||||
assert(!config_limit_usize_gap ||
|
||||
alloc_ctx->usize == sz_index2size(alloc_ctx->szind));
|
||||
return sz_index2size(alloc_ctx->szind);
|
||||
}
|
||||
assert(sz_limit_usize_gap_enabled() ||
|
||||
alloc_ctx->usize == sz_index2size(alloc_ctx->szind));
|
||||
assert(alloc_ctx->usize <= SC_LARGE_MAXCLASS);
|
||||
return alloc_ctx->usize;
|
||||
}
|
||||
|
||||
/* Fills in alloc_ctx with the info in the map. */
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
emap_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
|
||||
emap_alloc_ctx_t *alloc_ctx) {
|
||||
EMAP_DECLARE_RTREE_CTX;
|
||||
|
||||
rtree_metadata_t metadata = rtree_metadata_read(tsdn, &emap->rtree,
|
||||
rtree_ctx, (uintptr_t)ptr);
|
||||
alloc_ctx->szind = metadata.szind;
|
||||
alloc_ctx->slab = metadata.slab;
|
||||
if (config_limit_usize_gap) {
|
||||
rtree_contents_t contents = rtree_read(tsdn, &emap->rtree,
|
||||
rtree_ctx, (uintptr_t)ptr);
|
||||
/*
|
||||
* If the alloc is invalid, do not calculate usize since edata
|
||||
* could be corrupted.
|
||||
*/
|
||||
if (contents.metadata.szind == SC_NSIZES ||
|
||||
contents.edata == NULL) {
|
||||
emap_alloc_ctx_init(alloc_ctx, contents.metadata.szind,
|
||||
contents.metadata.slab, 0);
|
||||
return;
|
||||
}
|
||||
emap_alloc_ctx_init(alloc_ctx, contents.metadata.szind,
|
||||
contents.metadata.slab, edata_usize_get(contents.edata));
|
||||
} else {
|
||||
rtree_metadata_t metadata = rtree_metadata_read(tsdn,
|
||||
&emap->rtree, rtree_ctx, (uintptr_t)ptr);
|
||||
/* alloc_ctx->usize will not be read/write in this case. */
|
||||
emap_alloc_ctx_init(alloc_ctx, metadata.szind, metadata.slab,
|
||||
SC_LARGE_MAXCLASS + 1);
|
||||
}
|
||||
}
|
||||
|
||||
/* The pointer must be mapped. */
|
||||
|
|
@ -293,8 +344,15 @@ emap_alloc_ctx_try_lookup_fast(tsd_t *tsd, emap_t *emap, const void *ptr,
|
|||
if (err) {
|
||||
return true;
|
||||
}
|
||||
/*
|
||||
* Small allocs using the fastpath can always use index to get the
|
||||
* usize. Therefore, do not set alloc_ctx->usize here.
|
||||
*/
|
||||
alloc_ctx->szind = metadata.szind;
|
||||
alloc_ctx->slab = metadata.slab;
|
||||
if (config_debug) {
|
||||
alloc_ctx->usize = SC_LARGE_MAXCLASS + 1;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -20,8 +20,14 @@
|
|||
* an observable property of any given region of address space). It's just
|
||||
* hugepage-sized and hugepage-aligned; it's *potentially* huge.
|
||||
*/
|
||||
|
||||
/*
|
||||
* The max enumeration num should not exceed 2^16 - 1, see comments in edata.h
|
||||
* for ESET_ENUMERATE_MAX_NUM for more details.
|
||||
*/
|
||||
#define PSSET_ENUMERATE_MAX_NUM 32
|
||||
typedef struct hpdata_s hpdata_t;
|
||||
ph_structs(hpdata_age_heap, hpdata_t);
|
||||
ph_structs(hpdata_age_heap, hpdata_t, PSSET_ENUMERATE_MAX_NUM);
|
||||
struct hpdata_s {
|
||||
/*
|
||||
* We likewise follow the edata convention of mangling names and forcing
|
||||
|
|
|
|||
|
|
@ -475,6 +475,12 @@
|
|||
/* If defined, use __int128 for optimization. */
|
||||
#undef JEMALLOC_HAVE_INT128
|
||||
|
||||
/*
|
||||
* If defined, the gap between any two contiguous usizes should not exceed
|
||||
* PAGE.
|
||||
*/
|
||||
#undef LIMIT_USIZE_GAP
|
||||
|
||||
#include "jemalloc/internal/jemalloc_internal_overrides.h"
|
||||
|
||||
#endif /* JEMALLOC_INTERNAL_DEFS_H_ */
|
||||
|
|
|
|||
|
|
@ -39,6 +39,7 @@ extern atomic_zu_t zero_realloc_count;
|
|||
extern bool opt_cache_oblivious;
|
||||
extern unsigned opt_debug_double_free_max_scan;
|
||||
extern size_t opt_calloc_madvise_threshold;
|
||||
extern bool opt_limit_usize_gap;
|
||||
|
||||
extern const char *opt_malloc_conf_symlink;
|
||||
extern const char *opt_malloc_conf_env_var;
|
||||
|
|
|
|||
|
|
@ -425,8 +425,9 @@ maybe_check_alloc_ctx(tsd_t *tsd, void *ptr, emap_alloc_ctx_t *alloc_ctx) {
|
|||
if (alloc_ctx->szind != dbg_ctx.szind) {
|
||||
safety_check_fail_sized_dealloc(
|
||||
/* current_dealloc */ true, ptr,
|
||||
/* true_size */ sz_index2size(dbg_ctx.szind),
|
||||
/* input_size */ sz_index2size(alloc_ctx->szind));
|
||||
/* true_size */ emap_alloc_ctx_usize_get(&dbg_ctx),
|
||||
/* input_size */ emap_alloc_ctx_usize_get(
|
||||
alloc_ctx));
|
||||
return true;
|
||||
}
|
||||
if (alloc_ctx->slab != dbg_ctx.slab) {
|
||||
|
|
|
|||
|
|
@ -276,4 +276,12 @@ static const bool have_memcntl =
|
|||
#endif
|
||||
;
|
||||
|
||||
static const bool config_limit_usize_gap =
|
||||
#ifdef LIMIT_USIZE_GAP
|
||||
true
|
||||
#else
|
||||
false
|
||||
#endif
|
||||
;
|
||||
|
||||
#endif /* JEMALLOC_PREAMBLE_H */
|
||||
|
|
|
|||
|
|
@ -75,6 +75,16 @@ struct ph_s {
|
|||
size_t auxcount;
|
||||
};
|
||||
|
||||
typedef struct ph_enumerate_vars_s ph_enumerate_vars_t;
|
||||
struct ph_enumerate_vars_s {
|
||||
uint16_t front;
|
||||
uint16_t rear;
|
||||
uint16_t queue_size;
|
||||
uint16_t visited_num;
|
||||
uint16_t max_visit_num;
|
||||
uint16_t max_queue_size;
|
||||
};
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE phn_link_t *
|
||||
phn_link_get(void *phn, size_t offset) {
|
||||
return (phn_link_t *)(((char *)phn) + offset);
|
||||
|
|
@ -414,14 +424,98 @@ ph_remove(ph_t *ph, void *phn, size_t offset, ph_cmp_t cmp) {
|
|||
}
|
||||
}
|
||||
|
||||
#define ph_structs(a_prefix, a_type) \
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
ph_enumerate_vars_init(ph_enumerate_vars_t *vars, uint16_t max_visit_num,
|
||||
uint16_t max_queue_size) {
|
||||
vars->queue_size = 0;
|
||||
vars->visited_num = 0;
|
||||
vars->front = 0;
|
||||
vars->rear = 0;
|
||||
vars->max_visit_num = max_visit_num;
|
||||
vars->max_queue_size = max_queue_size;
|
||||
assert(vars->max_visit_num > 0);
|
||||
/*
|
||||
* max_queue_size must be able to support max_visit_num, which means
|
||||
* the queue will not overflow before reaching max_visit_num.
|
||||
*/
|
||||
assert(vars->max_queue_size >= (vars->max_visit_num + 1)/2);
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
ph_enumerate_queue_push(void *phn, void **bfs_queue,
|
||||
ph_enumerate_vars_t *vars) {
|
||||
assert(vars->queue_size < vars->max_queue_size);
|
||||
bfs_queue[vars->rear] = phn;
|
||||
vars->rear = (vars->rear + 1) % vars->max_queue_size;
|
||||
(vars->queue_size) ++;
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE void *
|
||||
ph_enumerate_queue_pop(void **bfs_queue, ph_enumerate_vars_t *vars) {
|
||||
assert(vars->queue_size > 0);
|
||||
assert(vars->queue_size <= vars->max_queue_size);
|
||||
void *ret = bfs_queue[vars->front];
|
||||
vars->front = (vars->front + 1) % vars->max_queue_size;
|
||||
(vars->queue_size) --;
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* The two functions below offer a solution to enumerate the pairing heap.
|
||||
 * When enumerating, always call ph_enumerate_prepare first to prepare the queue
|
||||
* needed for BFS. Next, call ph_enumerate_next to get the next element in
|
||||
* the enumeration. When enumeration ends, ph_enumerate_next returns NULL and
|
||||
* should not be called again. Enumeration ends when all elements in the heap
|
||||
 * have been enumerated or the number of visited elements exceeds
|
||||
* max_visit_num.
|
||||
*/
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
ph_enumerate_prepare(ph_t *ph, void **bfs_queue, ph_enumerate_vars_t *vars,
|
||||
uint16_t max_visit_num, uint16_t max_queue_size) {
|
||||
ph_enumerate_vars_init(vars, max_visit_num, max_queue_size);
|
||||
ph_enumerate_queue_push(ph->root, bfs_queue, vars);
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE void *
|
||||
ph_enumerate_next(ph_t *ph, size_t offset, void **bfs_queue,
|
||||
ph_enumerate_vars_t *vars) {
|
||||
if (vars->queue_size == 0) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
(vars->visited_num) ++;
|
||||
if (vars->visited_num > vars->max_visit_num) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void *ret = ph_enumerate_queue_pop(bfs_queue, vars);
|
||||
assert(ret != NULL);
|
||||
void *left = phn_lchild_get(ret, offset);
|
||||
void *right = phn_next_get(ret, offset);
|
||||
if (left) {
|
||||
ph_enumerate_queue_push(left, bfs_queue, vars);
|
||||
}
|
||||
if (right) {
|
||||
ph_enumerate_queue_push(right, bfs_queue, vars);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define ph_structs(a_prefix, a_type, a_max_queue_size) \
|
||||
typedef struct { \
|
||||
phn_link_t link; \
|
||||
} a_prefix##_link_t; \
|
||||
\
|
||||
typedef struct { \
|
||||
ph_t ph; \
|
||||
} a_prefix##_t;
|
||||
} a_prefix##_t; \
|
||||
\
|
||||
typedef struct { \
|
||||
void *bfs_queue[a_max_queue_size]; \
|
||||
ph_enumerate_vars_t vars; \
|
||||
} a_prefix##_enumerate_helper_t;
|
||||
|
||||
|
||||
/*
|
||||
* The ph_proto() macro generates function prototypes that correspond to the
|
||||
|
|
@ -436,7 +530,12 @@ a_attr a_type *a_prefix##_any(a_prefix##_t *ph); \
|
|||
a_attr void a_prefix##_insert(a_prefix##_t *ph, a_type *phn); \
|
||||
a_attr a_type *a_prefix##_remove_first(a_prefix##_t *ph); \
|
||||
a_attr void a_prefix##_remove(a_prefix##_t *ph, a_type *phn); \
|
||||
a_attr a_type *a_prefix##_remove_any(a_prefix##_t *ph);
|
||||
a_attr a_type *a_prefix##_remove_any(a_prefix##_t *ph); \
|
||||
a_attr void a_prefix##_enumerate_prepare(a_prefix##_t *ph, \
|
||||
a_prefix##_enumerate_helper_t *helper, uint16_t max_visit_num, \
|
||||
uint16_t max_queue_size); \
|
||||
a_attr a_type *a_prefix##_enumerate_next(a_prefix##_t *ph, \
|
||||
a_prefix##_enumerate_helper_t *helper);
|
||||
|
||||
/* The ph_gen() macro generates a type-specific pairing heap implementation. */
|
||||
#define ph_gen(a_attr, a_prefix, a_type, a_field, a_cmp) \
|
||||
|
|
@ -491,6 +590,21 @@ a_prefix##_remove_any(a_prefix##_t *ph) { \
|
|||
a_prefix##_remove(ph, ret); \
|
||||
} \
|
||||
return ret; \
|
||||
} \
|
||||
\
|
||||
a_attr void \
|
||||
a_prefix##_enumerate_prepare(a_prefix##_t *ph, \
|
||||
a_prefix##_enumerate_helper_t *helper, uint16_t max_visit_num, \
|
||||
uint16_t max_queue_size) { \
|
||||
ph_enumerate_prepare(&ph->ph, helper->bfs_queue, &helper->vars, \
|
||||
max_visit_num, max_queue_size); \
|
||||
} \
|
||||
\
|
||||
a_attr a_type * \
|
||||
a_prefix##_enumerate_next(a_prefix##_t *ph, \
|
||||
a_prefix##_enumerate_helper_t *helper) { \
|
||||
return ph_enumerate_next(&ph->ph, offsetof(a_type, a_field), \
|
||||
helper->bfs_queue, &helper->vars); \
|
||||
}
|
||||
|
||||
#endif /* JEMALLOC_INTERNAL_PH_H */
|
||||
|
|
|
|||
|
|
@ -286,6 +286,24 @@
|
|||
# endif
|
||||
#endif
|
||||
|
||||
/*
|
||||
* When config_limit_usize_gap is enabled, the gaps between two contiguous
|
||||
* size classes should not exceed PAGE. This means there should be no concept
|
||||
* of size classes for sizes > SC_SMALL_MAXCLASS (or >= SC_LARGE_MINCLASS).
|
||||
* However, between SC_LARGE_MINCLASS (SC_NGROUP * PAGE) and
|
||||
* 2 * SC_NGROUP * PAGE, the size class also happens to be aligned with PAGE.
|
||||
* Since tcache relies on size classes to work and it greatly increases the
|
||||
* perf of allocs & deallocs, we extend the existence of size class to
|
||||
* 2 * SC_NGROUP * PAGE ONLY for the tcache module. This means for all other
|
||||
* modules, there is no size class for sizes >= SC_LARGE_MINCLASS. Yet for
|
||||
* tcache, the threshold is moved up to 2 * SC_NGROUP * PAGE, which is
|
||||
* USIZE_GROW_SLOW_THRESHOLD defined below. With the default SC_NGROUP being
|
||||
* 2, and PAGE being 4KB, the threshold for tcache (USIZE_GROW_SLOW_THRESHOLD)
|
||||
* is 32KB.
|
||||
*/
|
||||
#define LG_USIZE_GROW_SLOW_THRESHOLD (SC_LG_NGROUP + LG_PAGE + 1)
|
||||
#define USIZE_GROW_SLOW_THRESHOLD (1U << LG_USIZE_GROW_SLOW_THRESHOLD)
|
||||
|
||||
#define SC_SLAB_MAXREGS (1U << SC_LG_SLAB_MAXREGS)
|
||||
|
||||
typedef struct sc_s sc_t;
|
||||
|
|
|
|||
|
|
@ -54,6 +54,15 @@ extern size_t sz_large_pad;
|
|||
|
||||
extern void sz_boot(const sc_data_t *sc_data, bool cache_oblivious);
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE bool
|
||||
sz_limit_usize_gap_enabled() {
|
||||
#ifdef LIMIT_USIZE_GAP
|
||||
return opt_limit_usize_gap;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE pszind_t
|
||||
sz_psz2ind(size_t psz) {
|
||||
assert(psz > 0);
|
||||
|
|
@ -257,11 +266,34 @@ sz_index2size_lookup(szind_t index) {
|
|||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE size_t
|
||||
sz_index2size(szind_t index) {
|
||||
sz_index2size_unsafe(szind_t index) {
|
||||
assert(index < SC_NSIZES);
|
||||
return sz_index2size_lookup(index);
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE size_t
|
||||
sz_index2size(szind_t index) {
|
||||
assert(!sz_limit_usize_gap_enabled() ||
|
||||
index <= sz_size2index(USIZE_GROW_SLOW_THRESHOLD));
|
||||
size_t size = sz_index2size_unsafe(index);
|
||||
/*
|
||||
* With limit_usize_gap enabled, the usize above
|
||||
* SC_LARGE_MINCLASS should grow by PAGE. However, for sizes
|
||||
* in [SC_LARGE_MINCLASS, USIZE_GROW_SLOW_THRESHOLD], the
|
||||
* usize would not change because the size class gap in this
|
||||
* range is just the same as PAGE. Although we use
|
||||
* SC_LARGE_MINCLASS as the threshold in most places, we
|
||||
* allow tcache and sec to cache up to
|
||||
* USIZE_GROW_SLOW_THRESHOLD to minimize the side effect of
|
||||
* not having size classes for larger sizes. Thus, we assert
|
||||
* the size is no larger than USIZE_GROW_SLOW_THRESHOLD here
|
||||
* instead of SC_LARGE_MINCLASS.
|
||||
*/
|
||||
assert(!sz_limit_usize_gap_enabled() ||
|
||||
size <= USIZE_GROW_SLOW_THRESHOLD);
|
||||
return size;
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE void
|
||||
sz_size2index_usize_fastpath(size_t size, szind_t *ind, size_t *usize) {
|
||||
if (util_compile_time_const(size)) {
|
||||
|
|
@ -296,7 +328,7 @@ sz_s2u_compute(size_t size) {
|
|||
(ZU(1) << lg_ceil));
|
||||
}
|
||||
#endif
|
||||
{
|
||||
if (size <= SC_SMALL_MAXCLASS || !sz_limit_usize_gap_enabled()) {
|
||||
size_t x = lg_floor((size<<1)-1);
|
||||
size_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1)
|
||||
? LG_QUANTUM : x - SC_LG_NGROUP - 1;
|
||||
|
|
@ -304,11 +336,22 @@ sz_s2u_compute(size_t size) {
|
|||
size_t delta_mask = delta - 1;
|
||||
size_t usize = (size + delta_mask) & ~delta_mask;
|
||||
return usize;
|
||||
} else {
|
||||
/*
|
||||
* With sz_limit_usize_gap_enabled() == true, usize of a large
|
||||
* allocation is calculated by ceiling size to the smallest
|
||||
* multiple of PAGE to minimize the memory overhead, especially
|
||||
* when using hugepages.
|
||||
*/
|
||||
size_t usize = PAGE_CEILING(size);
|
||||
assert(usize - size < PAGE);
|
||||
return usize;
|
||||
}
|
||||
}
|
||||
|
||||
JEMALLOC_ALWAYS_INLINE size_t
|
||||
sz_s2u_lookup(size_t size) {
|
||||
assert(!config_limit_usize_gap || size < SC_LARGE_MINCLASS);
|
||||
size_t ret = sz_index2size_lookup(sz_size2index_lookup(size));
|
||||
|
||||
assert(ret == sz_s2u_compute(size));
|
||||
|
|
|
|||
|
|
@ -19,7 +19,11 @@ typedef struct tcaches_s tcaches_t;
|
|||
/* NOLINTNEXTLINE(performance-no-int-to-ptr) */
|
||||
#define TCACHES_ELM_NEED_REINIT ((tcache_t *)(uintptr_t)1)
|
||||
|
||||
#define TCACHE_LG_MAXCLASS_LIMIT 23 /* tcache_max = 8M */
|
||||
#ifdef LIMIT_USIZE_GAP
|
||||
#define TCACHE_LG_MAXCLASS_LIMIT LG_USIZE_GROW_SLOW_THRESHOLD
|
||||
#else
|
||||
#define TCACHE_LG_MAXCLASS_LIMIT 23 /* tcache_max = 8M */
|
||||
#endif
|
||||
#define TCACHE_MAXCLASS_LIMIT ((size_t)1 << TCACHE_LG_MAXCLASS_LIMIT)
|
||||
#define TCACHE_NBINS_MAX (SC_NBINS + SC_NGROUP * \
|
||||
(TCACHE_LG_MAXCLASS_LIMIT - SC_LG_LARGE_MINCLASS) + 1)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue