diff --git a/configure.ac b/configure.ac index b01ff56b..a55a5a08 100644 --- a/configure.ac +++ b/configure.ac @@ -2732,6 +2732,24 @@ if test "x${have_pthread}" = "x1" -a "x${je_cv_os_unfair_lock}" != "xyes" -a \ AC_DEFINE([JEMALLOC_BACKGROUND_THREAD], [ ], [ ]) fi +dnl ============================================================================ +dnl Limit the gap between two contiguous usizes to be at most PAGE. +AC_ARG_ENABLE([limit_usize_gap], + [AS_HELP_STRING([--enable-limit-usize-gap], + [Limit the gap between two contiguous usizes])], +[if test "x$limit_usize_gap" = "xno" ; then + limit_usize_gap="0" +else + limit_usize_gap="1" +fi +], +[limit_usize_gap="0"] +) +if test "x$limit_usize_gap" = "x1" ; then + AC_DEFINE([LIMIT_USIZE_GAP], [ ]) +fi +AC_SUBST([limit_usize_gap]) + dnl ============================================================================ dnl Check for glibc malloc hooks @@ -2997,4 +3015,5 @@ AC_MSG_RESULT([cxx : ${enable_cxx}]) AC_MSG_RESULT([dss : ${enable_dss}]) AC_MSG_RESULT([tsan : ${enable_tsan}]) AC_MSG_RESULT([ubsan : ${enable_ubsan}]) +AC_MSG_RESULT([limit-usize-gap : ${limit_usize_gap}]) AC_MSG_RESULT([===============================================================================]) diff --git a/include/jemalloc/internal/arena_inlines_b.h b/include/jemalloc/internal/arena_inlines_b.h index ea246cc5..108493f2 100644 --- a/include/jemalloc/internal/arena_inlines_b.h +++ b/include/jemalloc/internal/arena_inlines_b.h @@ -51,7 +51,7 @@ arena_choose_maybe_huge(tsd_t *tsd, arena_t *arena, size_t size) { } JEMALLOC_ALWAYS_INLINE bool -large_dalloc_safety_checks(edata_t *edata, const void *ptr, szind_t szind) { +large_dalloc_safety_checks(edata_t *edata, const void *ptr, size_t input_size) { if (!config_opt_safety_checks) { return false; } @@ -68,7 +68,6 @@ large_dalloc_safety_checks(edata_t *edata, const void *ptr, szind_t szind) { "possibly caused by double free bugs.", ptr); return true; } - size_t input_size = sz_index2size(szind); if 
(unlikely(input_size != edata_usize_get(edata))) { safety_check_fail_sized_dealloc(/* current_dealloc */ true, ptr, /* true_size */ edata_usize_get(edata), input_size); @@ -101,9 +100,10 @@ arena_prof_info_get(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx, if (unlikely(!is_slab)) { /* edata must have been initialized at this point. */ assert(edata != NULL); + size_t usize = (alloc_ctx == NULL)? edata_usize_get(edata): + emap_alloc_ctx_usize_get(alloc_ctx); if (reset_recent && - large_dalloc_safety_checks(edata, ptr, - edata_szind_get(edata))) { + large_dalloc_safety_checks(edata, ptr, usize)) { prof_info->alloc_tctx = PROF_TCTX_SENTINEL; return; } @@ -225,7 +225,7 @@ arena_salloc(tsdn_t *tsdn, const void *ptr) { emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx); assert(alloc_ctx.szind != SC_NSIZES); - return sz_index2size(alloc_ctx.szind); + return emap_alloc_ctx_usize_get(&alloc_ctx); } JEMALLOC_ALWAYS_INLINE size_t @@ -256,17 +256,24 @@ arena_vsalloc(tsdn_t *tsdn, const void *ptr) { assert(full_alloc_ctx.szind != SC_NSIZES); - return sz_index2size(full_alloc_ctx.szind); + return edata_usize_get(full_alloc_ctx.edata); } static inline void -arena_dalloc_large_no_tcache(tsdn_t *tsdn, void *ptr, szind_t szind) { +arena_dalloc_large_no_tcache(tsdn_t *tsdn, void *ptr, szind_t szind, + size_t usize) { + /* + * szind is still needed in this function mainly because + * szind < SC_NBINS determines not only if this is a small alloc, + * but also if szind is valid (an inactive extent would have + * szind == SC_NSIZES). + */ if (config_prof && unlikely(szind < SC_NBINS)) { arena_dalloc_promoted(tsdn, ptr, NULL, true); } else { edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); - if (large_dalloc_safety_checks(edata, ptr, szind)) { + if (large_dalloc_safety_checks(edata, ptr, usize)) { /* See the comment in isfree. 
*/ return; } @@ -287,19 +294,22 @@ arena_dalloc_no_tcache(tsdn_t *tsdn, void *ptr) { assert(alloc_ctx.szind == edata_szind_get(edata)); assert(alloc_ctx.szind < SC_NSIZES); assert(alloc_ctx.slab == edata_slab_get(edata)); + assert(emap_alloc_ctx_usize_get(&alloc_ctx) == + edata_usize_get(edata)); } if (likely(alloc_ctx.slab)) { /* Small allocation. */ arena_dalloc_small(tsdn, ptr); } else { - arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind); + arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind, + emap_alloc_ctx_usize_get(&alloc_ctx)); } } JEMALLOC_ALWAYS_INLINE void arena_dalloc_large(tsdn_t *tsdn, void *ptr, tcache_t *tcache, szind_t szind, - bool slow_path) { + size_t usize, bool slow_path) { assert (!tsdn_null(tsdn) && tcache != NULL); bool is_sample_promoted = config_prof && szind < SC_NBINS; if (unlikely(is_sample_promoted)) { @@ -313,7 +323,7 @@ arena_dalloc_large(tsdn_t *tsdn, void *ptr, tcache_t *tcache, szind_t szind, } else { edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr); - if (large_dalloc_safety_checks(edata, ptr, szind)) { + if (large_dalloc_safety_checks(edata, ptr, usize)) { /* See the comment in isfree. */ return; } @@ -396,6 +406,8 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache, assert(alloc_ctx.szind == edata_szind_get(edata)); assert(alloc_ctx.szind < SC_NSIZES); assert(alloc_ctx.slab == edata_slab_get(edata)); + assert(emap_alloc_ctx_usize_get(&alloc_ctx) == + edata_usize_get(edata)); } if (likely(alloc_ctx.slab)) { @@ -407,7 +419,7 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache, alloc_ctx.szind, slow_path); } else { arena_dalloc_large(tsdn, ptr, tcache, alloc_ctx.szind, - slow_path); + emap_alloc_ctx_usize_get(&alloc_ctx), slow_path); } } @@ -422,8 +434,9 @@ arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) { * There is no risk of being confused by a promoted sampled * object, so base szind and slab on the given size. 
*/ - alloc_ctx.szind = sz_size2index(size); - alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS); + szind_t szind = sz_size2index(size); + emap_alloc_ctx_init(&alloc_ctx, szind, (szind < SC_NBINS), + size); } if ((config_prof && opt_prof) || config_debug) { @@ -446,7 +459,8 @@ arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) { /* Small allocation. */ arena_dalloc_small(tsdn, ptr); } else { - arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind); + arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind, + emap_alloc_ctx_usize_get(&alloc_ctx)); } } @@ -469,6 +483,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache, emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx); assert(alloc_ctx.szind == sz_size2index(size)); + assert(emap_alloc_ctx_usize_get(&alloc_ctx) == size); } else { alloc_ctx = *caller_alloc_ctx; } @@ -486,6 +501,11 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache, ptr); assert(alloc_ctx.szind == edata_szind_get(edata)); assert(alloc_ctx.slab == edata_slab_get(edata)); + emap_alloc_ctx_init(&alloc_ctx, alloc_ctx.szind, alloc_ctx.slab, + sz_s2u(size)); + assert(!config_limit_usize_gap || + emap_alloc_ctx_usize_get(&alloc_ctx) == + edata_usize_get(edata)); } if (likely(alloc_ctx.slab)) { @@ -497,7 +517,7 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache, alloc_ctx.szind, slow_path); } else { arena_dalloc_large(tsdn, ptr, tcache, alloc_ctx.szind, - slow_path); + sz_s2u(size), slow_path); } } diff --git a/include/jemalloc/internal/arena_stats.h b/include/jemalloc/internal/arena_stats.h index 3d512630..7f075114 100644 --- a/include/jemalloc/internal/arena_stats.h +++ b/include/jemalloc/internal/arena_stats.h @@ -14,12 +14,18 @@ JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS typedef struct arena_stats_large_s arena_stats_large_t; struct arena_stats_large_s { /* - * Total number of allocation/deallocation requests served directly by - * the arena. 
+ * Total number of large allocation/deallocation requests served directly + * by the arena. */ locked_u64_t nmalloc; locked_u64_t ndalloc; + /* + * Total large active bytes (allocated - deallocated) served directly + * by the arena. + */ + locked_u64_t active_bytes; + /* * Number of allocation requests that correspond to this size class. * This includes requests served by tcache, though tcache only diff --git a/include/jemalloc/internal/edata.h b/include/jemalloc/internal/edata.h index 2381ccbc..b087ea31 100644 --- a/include/jemalloc/internal/edata.h +++ b/include/jemalloc/internal/edata.h @@ -21,6 +21,14 @@ */ #define EDATA_ALIGNMENT 128 +/* + * Defines how many nodes are visited when enumerating the heap to search for + * qualified extents. More nodes visited may result in better choices at + * the cost of longer search time. This size should not exceed 2^16 - 1 + * because we use uint16_t for accessing the queue needed for enumeration. + */ +#define ESET_ENUMERATE_MAX_NUM 32 + enum extent_state_e { extent_state_active = 0, extent_state_dirty = 1, @@ -89,8 +97,8 @@ struct edata_cmp_summary_s { /* Extent (span of pages). Use accessor functions for e_* fields. */ typedef struct edata_s edata_t; -ph_structs(edata_avail, edata_t); -ph_structs(edata_heap, edata_t); +ph_structs(edata_avail, edata_t, ESET_ENUMERATE_MAX_NUM); +ph_structs(edata_heap, edata_t, ESET_ENUMERATE_MAX_NUM); struct edata_s { /* * Bitfield containing several fields: @@ -281,7 +289,54 @@ edata_szind_get(const edata_t *edata) { static inline size_t edata_usize_get(const edata_t *edata) { - return sz_index2size(edata_szind_get(edata)); + assert(edata != NULL); + /* + * When sz_limit_usize_gap_enabled() is true, two cases: + * 1. if usize_from_ind is not smaller than SC_LARGE_MINCLASS, + * usize_from_size is accurate; + * 2. otherwise, usize_from_ind is accurate. + * + * When sz_limit_usize_gap_enabled() is not true, the two should be the + * same when usize_from_ind is not smaller than SC_LARGE_MINCLASS. 
+ * + * Note sampled small allocs will be promoted. Their extent size is + * recorded in edata_size_get(edata), while their szind reflects the + * true usize. Thus, usize retrieved here is still accurate for + * sampled small allocs. + */ + szind_t szind = edata_szind_get(edata); +#ifdef JEMALLOC_JET + /* + * Double free is invalid and results in undefined behavior. However, + * for double free tests to end gracefully, return an invalid usize + * when szind shows the edata is not active, i.e., szind == SC_NSIZES. + */ + if (unlikely(szind == SC_NSIZES)) { + return SC_LARGE_MAXCLASS + 1; + } +#endif + + if (!sz_limit_usize_gap_enabled() || szind < SC_NBINS) { + size_t usize_from_ind = sz_index2size(szind); + if (!sz_limit_usize_gap_enabled() && + usize_from_ind >= SC_LARGE_MINCLASS) { + size_t size = (edata->e_size_esn & EDATA_SIZE_MASK); + assert(size > sz_large_pad); + size_t usize_from_size = size - sz_large_pad; + assert(usize_from_ind == usize_from_size); + } + return usize_from_ind; + } + + size_t size = (edata->e_size_esn & EDATA_SIZE_MASK); + assert(size > sz_large_pad); + size_t usize_from_size = size - sz_large_pad; + /* + * No matter whether limit-usize-gap is enabled or not, usize retrieved + * from size is not accurate when smaller than SC_LARGE_MINCLASS. + */ + assert(usize_from_size >= SC_LARGE_MINCLASS); + return usize_from_size; } static inline unsigned diff --git a/include/jemalloc/internal/emap.h b/include/jemalloc/internal/emap.h index 7ac0ae95..5885daa6 100644 --- a/include/jemalloc/internal/emap.h +++ b/include/jemalloc/internal/emap.h @@ -20,8 +20,9 @@ struct emap_s { }; /* Used to pass rtree lookup context down the path. 
*/ -typedef struct emap_alloc_ctx_t emap_alloc_ctx_t; -struct emap_alloc_ctx_t { +typedef struct emap_alloc_ctx_s emap_alloc_ctx_t; +struct emap_alloc_ctx_s { + size_t usize; szind_t szind; bool slab; }; @@ -230,16 +231,66 @@ emap_edata_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr) { return rtree_read(tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr).edata; } +JEMALLOC_ALWAYS_INLINE void +emap_alloc_ctx_init(emap_alloc_ctx_t *alloc_ctx, szind_t szind, bool slab, + size_t usize) { + alloc_ctx->szind = szind; + alloc_ctx->slab = slab; + /* + * When config_limit_usize_gap disabled, alloc_ctx->usize + * should not be accessed. + */ + if (config_limit_usize_gap) { + alloc_ctx->usize = usize; + assert(sz_limit_usize_gap_enabled() || + usize == sz_index2size(szind)); + } else if (config_debug) { + alloc_ctx->usize = SC_LARGE_MAXCLASS + 1; + } +} + +JEMALLOC_ALWAYS_INLINE size_t +emap_alloc_ctx_usize_get(emap_alloc_ctx_t *alloc_ctx) { + assert(alloc_ctx->szind < SC_NSIZES); + if (!config_limit_usize_gap || alloc_ctx->slab) { + assert(!config_limit_usize_gap || + alloc_ctx->usize == sz_index2size(alloc_ctx->szind)); + return sz_index2size(alloc_ctx->szind); + } + assert(sz_limit_usize_gap_enabled() || + alloc_ctx->usize == sz_index2size(alloc_ctx->szind)); + assert(alloc_ctx->usize <= SC_LARGE_MAXCLASS); + return alloc_ctx->usize; +} + /* Fills in alloc_ctx with the info in the map. */ JEMALLOC_ALWAYS_INLINE void emap_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr, emap_alloc_ctx_t *alloc_ctx) { EMAP_DECLARE_RTREE_CTX; - rtree_metadata_t metadata = rtree_metadata_read(tsdn, &emap->rtree, - rtree_ctx, (uintptr_t)ptr); - alloc_ctx->szind = metadata.szind; - alloc_ctx->slab = metadata.slab; + if (config_limit_usize_gap) { + rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, + rtree_ctx, (uintptr_t)ptr); + /* + * If the alloc is invalid, do not calculate usize since edata + * could be corrupted. 
+ */ + if (contents.metadata.szind == SC_NSIZES || + contents.edata == NULL) { + emap_alloc_ctx_init(alloc_ctx, contents.metadata.szind, + contents.metadata.slab, 0); + return; + } + emap_alloc_ctx_init(alloc_ctx, contents.metadata.szind, + contents.metadata.slab, edata_usize_get(contents.edata)); + } else { + rtree_metadata_t metadata = rtree_metadata_read(tsdn, + &emap->rtree, rtree_ctx, (uintptr_t)ptr); + /* alloc_ctx->usize will not be read/write in this case. */ + emap_alloc_ctx_init(alloc_ctx, metadata.szind, metadata.slab, + SC_LARGE_MAXCLASS + 1); + } } /* The pointer must be mapped. */ @@ -293,8 +344,15 @@ emap_alloc_ctx_try_lookup_fast(tsd_t *tsd, emap_t *emap, const void *ptr, if (err) { return true; } + /* + * Small allocs using the fastpath can always use index to get the + * usize. Therefore, do not set alloc_ctx->usize here. + */ alloc_ctx->szind = metadata.szind; alloc_ctx->slab = metadata.slab; + if (config_debug) { + alloc_ctx->usize = SC_LARGE_MAXCLASS + 1; + } return false; } diff --git a/include/jemalloc/internal/hpdata.h b/include/jemalloc/internal/hpdata.h index 7ba92112..a8a845ec 100644 --- a/include/jemalloc/internal/hpdata.h +++ b/include/jemalloc/internal/hpdata.h @@ -20,8 +20,14 @@ * an observable property of any given region of address space). It's just * hugepage-sized and hugepage-aligned; it's *potentially* huge. */ + +/* + * The max enumeration num should not exceed 2^16 - 1, see comments in edata.h + * for ESET_ENUMERATE_MAX_NUM for more details. 
+ */ +#define PSSET_ENUMERATE_MAX_NUM 32 typedef struct hpdata_s hpdata_t; -ph_structs(hpdata_age_heap, hpdata_t); +ph_structs(hpdata_age_heap, hpdata_t, PSSET_ENUMERATE_MAX_NUM); struct hpdata_s { /* * We likewise follow the edata convention of mangling names and forcing diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 742d599d..e76eaaf4 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -475,6 +475,12 @@ /* If defined, use __int128 for optimization. */ #undef JEMALLOC_HAVE_INT128 +/* + * If defined, the gap between any two contiguous usizes should not exceed + * PAGE. + */ +#undef LIMIT_USIZE_GAP + #include "jemalloc/internal/jemalloc_internal_overrides.h" #endif /* JEMALLOC_INTERNAL_DEFS_H_ */ diff --git a/include/jemalloc/internal/jemalloc_internal_externs.h b/include/jemalloc/internal/jemalloc_internal_externs.h index 2c6b58f7..8c6df450 100644 --- a/include/jemalloc/internal/jemalloc_internal_externs.h +++ b/include/jemalloc/internal/jemalloc_internal_externs.h @@ -39,6 +39,7 @@ extern atomic_zu_t zero_realloc_count; extern bool opt_cache_oblivious; extern unsigned opt_debug_double_free_max_scan; extern size_t opt_calloc_madvise_threshold; +extern bool opt_limit_usize_gap; extern const char *opt_malloc_conf_symlink; extern const char *opt_malloc_conf_env_var; diff --git a/include/jemalloc/internal/jemalloc_internal_inlines_c.h b/include/jemalloc/internal/jemalloc_internal_inlines_c.h index 854aec1e..c7ef9161 100644 --- a/include/jemalloc/internal/jemalloc_internal_inlines_c.h +++ b/include/jemalloc/internal/jemalloc_internal_inlines_c.h @@ -425,8 +425,9 @@ maybe_check_alloc_ctx(tsd_t *tsd, void *ptr, emap_alloc_ctx_t *alloc_ctx) { if (alloc_ctx->szind != dbg_ctx.szind) { safety_check_fail_sized_dealloc( /* current_dealloc */ true, ptr, - /* true_size */ sz_index2size(dbg_ctx.szind), - /* input_size */ 
sz_index2size(alloc_ctx->szind)); + /* true_size */ emap_alloc_ctx_usize_get(&dbg_ctx), + /* input_size */ emap_alloc_ctx_usize_get( + alloc_ctx)); return true; } if (alloc_ctx->slab != dbg_ctx.slab) { diff --git a/include/jemalloc/internal/jemalloc_preamble.h.in b/include/jemalloc/internal/jemalloc_preamble.h.in index a59c3489..ef637a2d 100644 --- a/include/jemalloc/internal/jemalloc_preamble.h.in +++ b/include/jemalloc/internal/jemalloc_preamble.h.in @@ -276,4 +276,12 @@ static const bool have_memcntl = #endif ; +static const bool config_limit_usize_gap = +#ifdef LIMIT_USIZE_GAP + true +#else + false +#endif + ; + #endif /* JEMALLOC_PREAMBLE_H */ diff --git a/include/jemalloc/internal/ph.h b/include/jemalloc/internal/ph.h index ef9634be..05376004 100644 --- a/include/jemalloc/internal/ph.h +++ b/include/jemalloc/internal/ph.h @@ -75,6 +75,16 @@ struct ph_s { size_t auxcount; }; +typedef struct ph_enumerate_vars_s ph_enumerate_vars_t; +struct ph_enumerate_vars_s { + uint16_t front; + uint16_t rear; + uint16_t queue_size; + uint16_t visited_num; + uint16_t max_visit_num; + uint16_t max_queue_size; +}; + JEMALLOC_ALWAYS_INLINE phn_link_t * phn_link_get(void *phn, size_t offset) { return (phn_link_t *)(((char *)phn) + offset); @@ -414,14 +424,98 @@ ph_remove(ph_t *ph, void *phn, size_t offset, ph_cmp_t cmp) { } } -#define ph_structs(a_prefix, a_type) \ +JEMALLOC_ALWAYS_INLINE void +ph_enumerate_vars_init(ph_enumerate_vars_t *vars, uint16_t max_visit_num, + uint16_t max_queue_size) { + vars->queue_size = 0; + vars->visited_num = 0; + vars->front = 0; + vars->rear = 0; + vars->max_visit_num = max_visit_num; + vars->max_queue_size = max_queue_size; + assert(vars->max_visit_num > 0); + /* + * max_queue_size must be able to support max_visit_num, which means + * the queue will not overflow before reaching max_visit_num. 
*/ + assert(vars->max_queue_size >= (vars->max_visit_num + 1)/2); +} + +JEMALLOC_ALWAYS_INLINE void +ph_enumerate_queue_push(void *phn, void **bfs_queue, + ph_enumerate_vars_t *vars) { + assert(vars->queue_size < vars->max_queue_size); + bfs_queue[vars->rear] = phn; + vars->rear = (vars->rear + 1) % vars->max_queue_size; + (vars->queue_size) ++; +} + +JEMALLOC_ALWAYS_INLINE void * +ph_enumerate_queue_pop(void **bfs_queue, ph_enumerate_vars_t *vars) { + assert(vars->queue_size > 0); + assert(vars->queue_size <= vars->max_queue_size); + void *ret = bfs_queue[vars->front]; + vars->front = (vars->front + 1) % vars->max_queue_size; + (vars->queue_size) --; + return ret; +} + + +/* + * The two functions below offer a solution to enumerate the pairing heap. + * When enumerating, always call ph_enumerate_prepare first to prepare the queue + * needed for BFS. Next, call ph_enumerate_next to get the next element in + * the enumeration. When enumeration ends, ph_enumerate_next returns NULL and + * should not be called again. Enumeration ends when all elements in the heap + * have been enumerated or the number of visited elements exceeds + * max_visit_num. 
+ */ +JEMALLOC_ALWAYS_INLINE void +ph_enumerate_prepare(ph_t *ph, void **bfs_queue, ph_enumerate_vars_t *vars, + uint16_t max_visit_num, uint16_t max_queue_size) { + ph_enumerate_vars_init(vars, max_visit_num, max_queue_size); + ph_enumerate_queue_push(ph->root, bfs_queue, vars); +} + +JEMALLOC_ALWAYS_INLINE void * +ph_enumerate_next(ph_t *ph, size_t offset, void **bfs_queue, + ph_enumerate_vars_t *vars) { + if (vars->queue_size == 0) { + return NULL; + } + + (vars->visited_num) ++; + if (vars->visited_num > vars->max_visit_num) { + return NULL; + } + + void *ret = ph_enumerate_queue_pop(bfs_queue, vars); + assert(ret != NULL); + void *left = phn_lchild_get(ret, offset); + void *right = phn_next_get(ret, offset); + if (left) { + ph_enumerate_queue_push(left, bfs_queue, vars); + } + if (right) { + ph_enumerate_queue_push(right, bfs_queue, vars); + } + return ret; +} + +#define ph_structs(a_prefix, a_type, a_max_queue_size) \ typedef struct { \ phn_link_t link; \ } a_prefix##_link_t; \ \ typedef struct { \ ph_t ph; \ -} a_prefix##_t; +} a_prefix##_t; \ + \ +typedef struct { \ + void *bfs_queue[a_max_queue_size]; \ + ph_enumerate_vars_t vars; \ +} a_prefix##_enumerate_helper_t; + /* * The ph_proto() macro generates function prototypes that correspond to the @@ -436,7 +530,12 @@ a_attr a_type *a_prefix##_any(a_prefix##_t *ph); \ a_attr void a_prefix##_insert(a_prefix##_t *ph, a_type *phn); \ a_attr a_type *a_prefix##_remove_first(a_prefix##_t *ph); \ a_attr void a_prefix##_remove(a_prefix##_t *ph, a_type *phn); \ -a_attr a_type *a_prefix##_remove_any(a_prefix##_t *ph); +a_attr a_type *a_prefix##_remove_any(a_prefix##_t *ph); \ +a_attr void a_prefix##_enumerate_prepare(a_prefix##_t *ph, \ + a_prefix##_enumerate_helper_t *helper, uint16_t max_visit_num, \ + uint16_t max_queue_size); \ +a_attr a_type *a_prefix##_enumerate_next(a_prefix##_t *ph, \ + a_prefix##_enumerate_helper_t *helper); /* The ph_gen() macro generates a type-specific pairing heap implementation. 
*/ #define ph_gen(a_attr, a_prefix, a_type, a_field, a_cmp) \ @@ -491,6 +590,21 @@ a_prefix##_remove_any(a_prefix##_t *ph) { \ a_prefix##_remove(ph, ret); \ } \ return ret; \ +} \ + \ +a_attr void \ +a_prefix##_enumerate_prepare(a_prefix##_t *ph, \ + a_prefix##_enumerate_helper_t *helper, uint16_t max_visit_num, \ + uint16_t max_queue_size) { \ + ph_enumerate_prepare(&ph->ph, helper->bfs_queue, &helper->vars, \ + max_visit_num, max_queue_size); \ +} \ + \ +a_attr a_type * \ +a_prefix##_enumerate_next(a_prefix##_t *ph, \ + a_prefix##_enumerate_helper_t *helper) { \ + return ph_enumerate_next(&ph->ph, offsetof(a_type, a_field), \ + helper->bfs_queue, &helper->vars); \ } #endif /* JEMALLOC_INTERNAL_PH_H */ diff --git a/include/jemalloc/internal/sc.h b/include/jemalloc/internal/sc.h index 770835cc..098e47b7 100644 --- a/include/jemalloc/internal/sc.h +++ b/include/jemalloc/internal/sc.h @@ -286,6 +286,24 @@ # endif #endif +/* + * When config_limit_usize_gap is enabled, the gaps between two contiguous + * size classes should not exceed PAGE. This means there should be no concept + * of size classes for sizes > SC_SMALL_MAXCLASS (or >= SC_LARGE_MINCLASS). + * However, between SC_LARGE_MINCLASS (SC_NGROUP * PAGE) and + * 2 * SC_NGROUP * PAGE, the size class also happens to be aligned with PAGE. + * Since tcache relies on size classes to work and it greatly increases the + * perf of allocs & deallocs, we extend the existence of size class to + * 2 * SC_NGROUP * PAGE ONLY for the tcache module. This means for all other + * modules, there is no size class for sizes >= SC_LARGE_MINCLASS. Yet for + * tcache, the threshold is moved up to 2 * SC_NGROUP * PAGE, which is + * USIZE_GROW_SLOW_THRESHOLD defined below. With the default SC_NGROUP being + * 2, and PAGE being 4KB, the threshold for tcache (USIZE_GROW_SLOW_THRESHOLD) + * is 32KB. 
+ */ +#define LG_USIZE_GROW_SLOW_THRESHOLD (SC_LG_NGROUP + LG_PAGE + 1) +#define USIZE_GROW_SLOW_THRESHOLD (1U << LG_USIZE_GROW_SLOW_THRESHOLD) + #define SC_SLAB_MAXREGS (1U << SC_LG_SLAB_MAXREGS) typedef struct sc_s sc_t; diff --git a/include/jemalloc/internal/sz.h b/include/jemalloc/internal/sz.h index a2d2debc..6c0a1f0c 100644 --- a/include/jemalloc/internal/sz.h +++ b/include/jemalloc/internal/sz.h @@ -54,6 +54,15 @@ extern size_t sz_large_pad; extern void sz_boot(const sc_data_t *sc_data, bool cache_oblivious); +JEMALLOC_ALWAYS_INLINE bool +sz_limit_usize_gap_enabled() { +#ifdef LIMIT_USIZE_GAP + return opt_limit_usize_gap; +#else + return false; +#endif +} + JEMALLOC_ALWAYS_INLINE pszind_t sz_psz2ind(size_t psz) { assert(psz > 0); @@ -257,11 +266,34 @@ sz_index2size_lookup(szind_t index) { } JEMALLOC_ALWAYS_INLINE size_t -sz_index2size(szind_t index) { +sz_index2size_unsafe(szind_t index) { assert(index < SC_NSIZES); return sz_index2size_lookup(index); } +JEMALLOC_ALWAYS_INLINE size_t +sz_index2size(szind_t index) { + assert(!sz_limit_usize_gap_enabled() || + index <= sz_size2index(USIZE_GROW_SLOW_THRESHOLD)); + size_t size = sz_index2size_unsafe(index); + /* + * With limit_usize_gap enabled, the usize above + * SC_LARGE_MINCLASS should grow by PAGE. However, for sizes + * in [SC_LARGE_MINCLASS, USIZE_GROW_SLOW_THRESHOLD], the + * usize would not change because the size class gap in this + * range is just the same as PAGE. Although we use + * SC_LARGE_MINCLASS as the threshold in most places, we + * allow tcache and sec to cache up to + * USIZE_GROW_SLOW_THRESHOLD to minimize the side effect of + * not having size classes for larger sizes. Thus, we assert + * the size is no larger than USIZE_GROW_SLOW_THRESHOLD here + * instead of SC_LARGE_MINCLASS. 
+ */ + assert(!sz_limit_usize_gap_enabled() || + size <= USIZE_GROW_SLOW_THRESHOLD); + return size; +} + JEMALLOC_ALWAYS_INLINE void sz_size2index_usize_fastpath(size_t size, szind_t *ind, size_t *usize) { if (util_compile_time_const(size)) { @@ -296,7 +328,7 @@ sz_s2u_compute(size_t size) { (ZU(1) << lg_ceil)); } #endif - { + if (size <= SC_SMALL_MAXCLASS || !sz_limit_usize_gap_enabled()) { size_t x = lg_floor((size<<1)-1); size_t lg_delta = (x < SC_LG_NGROUP + LG_QUANTUM + 1) ? LG_QUANTUM : x - SC_LG_NGROUP - 1; @@ -304,11 +336,22 @@ sz_s2u_compute(size_t size) { size_t delta_mask = delta - 1; size_t usize = (size + delta_mask) & ~delta_mask; return usize; + } else { + /* + * With sz_limit_usize_gap_enabled() == true, usize of a large + * allocation is calculated by ceiling size to the smallest + * multiple of PAGE to minimize the memory overhead, especially + * when using hugepages. + */ + size_t usize = PAGE_CEILING(size); + assert(usize - size < PAGE); + return usize; } } JEMALLOC_ALWAYS_INLINE size_t sz_s2u_lookup(size_t size) { + assert(!config_limit_usize_gap || size < SC_LARGE_MINCLASS); size_t ret = sz_index2size_lookup(sz_size2index_lookup(size)); assert(ret == sz_s2u_compute(size)); diff --git a/include/jemalloc/internal/tcache_types.h b/include/jemalloc/internal/tcache_types.h index eebad79f..f13ff748 100644 --- a/include/jemalloc/internal/tcache_types.h +++ b/include/jemalloc/internal/tcache_types.h @@ -19,7 +19,11 @@ typedef struct tcaches_s tcaches_t; /* NOLINTNEXTLINE(performance-no-int-to-ptr) */ #define TCACHES_ELM_NEED_REINIT ((tcache_t *)(uintptr_t)1) -#define TCACHE_LG_MAXCLASS_LIMIT 23 /* tcache_max = 8M */ +#ifdef LIMIT_USIZE_GAP + #define TCACHE_LG_MAXCLASS_LIMIT LG_USIZE_GROW_SLOW_THRESHOLD +#else + #define TCACHE_LG_MAXCLASS_LIMIT 23 /* tcache_max = 8M */ +#endif #define TCACHE_MAXCLASS_LIMIT ((size_t)1 << TCACHE_LG_MAXCLASS_LIMIT) #define TCACHE_NBINS_MAX (SC_NBINS + SC_NGROUP * \ (TCACHE_LG_MAXCLASS_LIMIT - SC_LG_LARGE_MINCLASS) + 1) 
diff --git a/src/arena.c b/src/arena.c index ab6006d7..54ecc403 100644 --- a/src/arena.c +++ b/src/arena.c @@ -145,8 +145,18 @@ arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads, assert(nmalloc - ndalloc <= SIZE_T_MAX); size_t curlextents = (size_t)(nmalloc - ndalloc); lstats[i].curlextents += curlextents; - astats->allocated_large += - curlextents * sz_index2size(SC_NBINS + i); + + if (config_limit_usize_gap) { + uint64_t active_bytes = locked_read_u64(tsdn, + LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[i].active_bytes); + locked_inc_u64_unsynchronized( + &lstats[i].active_bytes, active_bytes); + astats->allocated_large += active_bytes; + } else { + astats->allocated_large += + curlextents * sz_index2size(SC_NBINS + i); + } } pa_shard_stats_merge(tsdn, &arena->pa_shard, &astats->pa_shard_stats, @@ -315,6 +325,11 @@ arena_large_malloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) { LOCKEDINT_MTX_LOCK(tsdn, arena->stats.mtx); locked_inc_u64(tsdn, LOCKEDINT_MTX(arena->stats.mtx), &arena->stats.lstats[hindex].nmalloc, 1); + if (config_limit_usize_gap) { + locked_inc_u64(tsdn, LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[hindex].active_bytes, + usize); + } LOCKEDINT_MTX_UNLOCK(tsdn, arena->stats.mtx); } } @@ -338,6 +353,11 @@ arena_large_dalloc_stats_update(tsdn_t *tsdn, arena_t *arena, size_t usize) { LOCKEDINT_MTX_LOCK(tsdn, arena->stats.mtx); locked_inc_u64(tsdn, LOCKEDINT_MTX(arena->stats.mtx), &arena->stats.lstats[hindex].ndalloc, 1); + if (config_limit_usize_gap) { + locked_dec_u64(tsdn, LOCKEDINT_MTX(arena->stats.mtx), + &arena->stats.lstats[hindex].active_bytes, + usize); + } LOCKEDINT_MTX_UNLOCK(tsdn, arena->stats.mtx); } } @@ -802,7 +822,7 @@ arena_reset(tsd_t *tsd, arena_t *arena) { assert(alloc_ctx.szind != SC_NSIZES); if (config_stats || (config_prof && opt_prof)) { - usize = sz_index2size(alloc_ctx.szind); + usize = emap_alloc_ctx_usize_get(&alloc_ctx); assert(usize == isalloc(tsd_tsdn(tsd), ptr)); } /* 
Remove large allocation from prof sample set. */ @@ -1346,7 +1366,7 @@ arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, assert(sz_can_use_slab(size)); return arena_malloc_small(tsdn, arena, ind, zero); } else { - return large_malloc(tsdn, arena, sz_index2size(ind), zero); + return large_malloc(tsdn, arena, sz_s2u(size), zero); } } diff --git a/src/ctl.c b/src/ctl.c index 1ebcbf8e..73d4cb66 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -168,6 +168,7 @@ CTL_PROTO(opt_prof_sys_thread_name) CTL_PROTO(opt_prof_time_res) CTL_PROTO(opt_lg_san_uaf_align) CTL_PROTO(opt_zero_realloc) +CTL_PROTO(opt_limit_usize_gap) CTL_PROTO(opt_malloc_conf_symlink) CTL_PROTO(opt_malloc_conf_env_var) CTL_PROTO(opt_malloc_conf_global_var) @@ -557,6 +558,7 @@ static const ctl_named_node_t opt_node[] = { {NAME("zero_realloc"), CTL(opt_zero_realloc)}, {NAME("debug_double_free_max_scan"), CTL(opt_debug_double_free_max_scan)}, + {NAME("limit_usize_gap"), CTL(opt_limit_usize_gap)}, {NAME("malloc_conf"), CHILD(named, opt_malloc_conf)} }; @@ -2341,6 +2343,8 @@ CTL_RO_NL_CGEN(config_uaf_detection, opt_lg_san_uaf_align, opt_lg_san_uaf_align, ssize_t) CTL_RO_NL_GEN(opt_zero_realloc, zero_realloc_mode_names[opt_zero_realloc_action], const char *) +CTL_RO_NL_CGEN(config_limit_usize_gap, opt_limit_usize_gap, opt_limit_usize_gap, + bool) /* malloc_conf options */ CTL_RO_NL_CGEN(opt_malloc_conf_symlink, opt_malloc_conf_symlink, @@ -3364,8 +3368,8 @@ arenas_bin_i_index(tsdn_t *tsdn, const size_t *mib, } CTL_RO_NL_GEN(arenas_nlextents, SC_NSIZES - SC_NBINS, unsigned) -CTL_RO_NL_GEN(arenas_lextent_i_size, sz_index2size(SC_NBINS+(szind_t)mib[2]), - size_t) +CTL_RO_NL_GEN(arenas_lextent_i_size, + sz_index2size_unsafe(SC_NBINS+(szind_t)mib[2]), size_t) static const ctl_named_node_t * arenas_lextent_i_index(tsdn_t *tsdn, const size_t *mib, size_t miblen, size_t i) { diff --git a/src/eset.c b/src/eset.c index 6f8f335e..7dc9cce7 100644 --- a/src/eset.c +++ b/src/eset.c @@ -155,6 +155,71 @@ 
eset_remove(eset_t *eset, edata_t *edata) { cur_extents_npages - (size >> LG_PAGE), ATOMIC_RELAXED); } +edata_t * +eset_enumerate_alignment_search(eset_t *eset, size_t size, pszind_t bin_ind, + size_t alignment) { + if (edata_heap_empty(&eset->bins[bin_ind].heap)) { + return NULL; + } + + edata_t *edata = NULL; + edata_heap_enumerate_helper_t helper; + edata_heap_enumerate_prepare(&eset->bins[bin_ind].heap, &helper, + ESET_ENUMERATE_MAX_NUM, sizeof(helper.bfs_queue)/sizeof(void *)); + while ((edata = + edata_heap_enumerate_next(&eset->bins[bin_ind].heap, &helper)) != + NULL) { + uintptr_t base = (uintptr_t)edata_base_get(edata); + size_t candidate_size = edata_size_get(edata); + if (candidate_size < size) { + continue; + } + + uintptr_t next_align = ALIGNMENT_CEILING((uintptr_t)base, + PAGE_CEILING(alignment)); + if (base > next_align || base + candidate_size <= next_align) { + /* Overflow or not crossing the next alignment. */ + continue; + } + + size_t leadsize = next_align - base; + if (candidate_size - leadsize >= size) { + return edata; + } + } + + return NULL; +} + +edata_t * +eset_enumerate_search(eset_t *eset, size_t size, pszind_t bin_ind, + bool exact_only, edata_cmp_summary_t *ret_summ) { + if (edata_heap_empty(&eset->bins[bin_ind].heap)) { + return NULL; + } + + edata_t *ret = NULL, *edata = NULL; + edata_heap_enumerate_helper_t helper; + edata_heap_enumerate_prepare(&eset->bins[bin_ind].heap, &helper, + ESET_ENUMERATE_MAX_NUM, sizeof(helper.bfs_queue)/sizeof(void *)); + while ((edata = + edata_heap_enumerate_next(&eset->bins[bin_ind].heap, &helper)) != + NULL) { + if ((!exact_only && edata_size_get(edata) >= size) || + (exact_only && edata_size_get(edata) == size)) { + edata_cmp_summary_t temp_summ = + edata_cmp_summary_get(edata); + if (ret == NULL || edata_cmp_summary_comp(temp_summ, + *ret_summ) < 0) { + ret = edata; + *ret_summ = temp_summ; + } + } + } + + return ret; +} + /* * Find an extent with size [min_size, max_size) to satisfy the alignment 
* requirement. For each size, try only the first extent in the heap. @@ -162,8 +227,19 @@ eset_remove(eset_t *eset, edata_t *edata) { static edata_t * eset_fit_alignment(eset_t *eset, size_t min_size, size_t max_size, size_t alignment) { - pszind_t pind = sz_psz2ind(sz_psz_quantize_ceil(min_size)); - pszind_t pind_max = sz_psz2ind(sz_psz_quantize_ceil(max_size)); + pszind_t pind = sz_psz2ind(sz_psz_quantize_ceil(min_size)); + pszind_t pind_max = sz_psz2ind(sz_psz_quantize_ceil(max_size)); + + /* See comments in eset_first_fit for why we enumerate search below. */ + pszind_t pind_prev = sz_psz2ind(sz_psz_quantize_floor(min_size)); + if (sz_limit_usize_gap_enabled() && pind != pind_prev) { + edata_t *ret = NULL; + ret = eset_enumerate_alignment_search(eset, min_size, pind_prev, + alignment); + if (ret != NULL) { + return ret; + } + } for (pszind_t i = (pszind_t)fb_ffs(eset->bitmap, ESET_NPSIZES, (size_t)pind); @@ -211,8 +287,43 @@ eset_first_fit(eset_t *eset, size_t size, bool exact_only, pszind_t pind = sz_psz2ind(sz_psz_quantize_ceil(size)); if (exact_only) { + if (sz_limit_usize_gap_enabled()) { + pszind_t pind_prev = + sz_psz2ind(sz_psz_quantize_floor(size)); + return eset_enumerate_search(eset, size, pind_prev, + /* exact_only */ true, &ret_summ); + } else { + return edata_heap_empty(&eset->bins[pind].heap) ? NULL: + edata_heap_first(&eset->bins[pind].heap); + } + } + + /* + * Each element in the eset->bins is a heap corresponding to a size + * class. When sz_limit_usize_gap_enabled() is false, all heaps after + * pind (including pind itself) will surely satisfy the requests while + * heaps before pind cannot satisfy the request because usize is + * calculated based on size classes then. However, when + * sz_limit_usize_gap_enabled() is true, usize is calculated by ceiling + * user requested size to the closest multiple of PAGE. 
This means in + * the heap before pind, i.e., pind_prev, there may exist extents able + * to satisfy the request and we should enumerate the heap when + * pind_prev != pind. + * + * For example, when PAGE=4KB and the user requested size is 1MB + 4KB, + * usize would be 1.25MB when sz_limit_usize_gap_enabled() is false. + * pind points to the heap containing extents ranging in + * [1.25MB, 1.5MB). Thus, searching starting from pind will not miss + * any candidates. When sz_limit_usize_gap_enabled() is true, the + * usize would be 1MB + 4KB and pind still points to the same heap. + * In this case, the heap pind_prev points to, which contains extents + * in the range [1MB, 1.25MB), may contain candidates satisfying the + * usize and thus should be enumerated. + */ + pszind_t pind_prev = sz_psz2ind(sz_psz_quantize_floor(size)); + if (sz_limit_usize_gap_enabled() && pind != pind_prev){ + ret = eset_enumerate_search(eset, size, pind_prev, + /* exact_only */ false, &ret_summ); } for (pszind_t i = diff --git a/src/hpa.c b/src/hpa.c index 932cf201..2a5d7e1f 100644 --- a/src/hpa.c +++ b/src/hpa.c @@ -706,7 +706,7 @@ hpa_alloc_batch_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, bool *deferred_work_generated) { assert(size <= HUGEPAGE); assert(size <= shard->opts.slab_max_alloc || - size == sz_index2size(sz_size2index(size))); + size == sz_s2u(size)); bool oom = false; size_t nsuccess = hpa_try_alloc_batch_no_grow(tsdn, shard, size, &oom, diff --git a/src/jemalloc.c b/src/jemalloc.c index 31d4cb27..67456bb7 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -123,6 +123,13 @@ zero_realloc_action_t opt_zero_realloc_action = atomic_zu_t zero_realloc_count = ATOMIC_INIT(0); +bool opt_limit_usize_gap = +#ifdef LIMIT_USIZE_GAP + true; +#else + false; +#endif + const char *const zero_realloc_mode_names[] = { "alloc", "free", @@ -1578,8 +1585,8 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], "hpa_sec_nshards", 0, 0, CONF_CHECK_MIN, 
CONF_DONT_CHECK_MAX, true); CONF_HANDLE_SIZE_T(opt_hpa_sec_opts.max_alloc, - "hpa_sec_max_alloc", PAGE, 0, CONF_CHECK_MIN, - CONF_DONT_CHECK_MAX, true); + "hpa_sec_max_alloc", PAGE, USIZE_GROW_SLOW_THRESHOLD, + CONF_CHECK_MIN, CONF_CHECK_MAX, true); CONF_HANDLE_SIZE_T(opt_hpa_sec_opts.max_bytes, "hpa_sec_max_bytes", PAGE, 0, CONF_CHECK_MIN, CONF_DONT_CHECK_MAX, true); @@ -1763,6 +1770,11 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], "san_guard_large", 0, SIZE_T_MAX, CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, false) + if (config_limit_usize_gap) { + CONF_HANDLE_BOOL(opt_limit_usize_gap, + "limit_usize_gap"); + } + CONF_ERROR("Invalid conf pair", k, klen, v, vlen); #undef CONF_ERROR #undef CONF_CONTINUE @@ -2182,6 +2194,17 @@ static bool malloc_init_hard(void) { tsd_t *tsd; + if (config_limit_usize_gap) { + assert(TCACHE_MAXCLASS_LIMIT <= USIZE_GROW_SLOW_THRESHOLD); + assert(SC_LOOKUP_MAXCLASS <= USIZE_GROW_SLOW_THRESHOLD); + /* + * This asserts an extreme case where TINY_MAXCLASS is larger + * than LARGE_MINCLASS. It could only happen if some constants + * are configured miserably wrong. + */ + assert(SC_LG_TINY_MAXCLASS <= + (size_t)1ULL << (LG_PAGE + SC_LG_NGROUP)); + } #if defined(_WIN32) && _WIN32_WINNT < 0x0600 _init_init_lock(); #endif @@ -2376,7 +2399,8 @@ aligned_usize_get(size_t size, size_t alignment, size_t *usize, szind_t *ind, if (unlikely(*ind >= SC_NSIZES)) { return true; } - *usize = sz_index2size(*ind); + *usize = sz_limit_usize_gap_enabled()? 
sz_s2u(size): + sz_index2size(*ind); assert(*usize > 0 && *usize <= SC_LARGE_MAXCLASS); return false; } @@ -2924,7 +2948,7 @@ ifree(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path) { &alloc_ctx); assert(alloc_ctx.szind != SC_NSIZES); - size_t usize = sz_index2size(alloc_ctx.szind); + size_t usize = emap_alloc_ctx_usize_get(&alloc_ctx); if (config_prof && opt_prof) { prof_free(tsd, ptr, usize, &alloc_ctx); } @@ -2956,35 +2980,41 @@ isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache, bool slow_path) { assert(malloc_initialized() || IS_INITIALIZER); emap_alloc_ctx_t alloc_ctx; + szind_t szind = sz_size2index(usize); if (!config_prof) { - alloc_ctx.szind = sz_size2index(usize); - alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS); + emap_alloc_ctx_init(&alloc_ctx, szind, (szind < SC_NBINS), + usize); } else { if (likely(!prof_sample_aligned(ptr))) { /* * When the ptr is not page aligned, it was not sampled. * usize can be trusted to determine szind and slab. */ - alloc_ctx.szind = sz_size2index(usize); - alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS); + emap_alloc_ctx_init(&alloc_ctx, szind, + (szind < SC_NBINS), usize); } else if (opt_prof) { + /* + * Small sampled allocs promoted can still get correct + * usize here. Check comments in edata_usize_get. + */ emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, &alloc_ctx); if (config_opt_safety_checks) { /* Small alloc may have !slab (sampled). 
*/ + size_t true_size = + emap_alloc_ctx_usize_get(&alloc_ctx); if (unlikely(alloc_ctx.szind != sz_size2index(usize))) { safety_check_fail_sized_dealloc( /* current_dealloc */ true, ptr, - /* true_size */ sz_index2size( - alloc_ctx.szind), + /* true_size */ true_size, /* input_size */ usize); } } } else { - alloc_ctx.szind = sz_size2index(usize); - alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS); + emap_alloc_ctx_init(&alloc_ctx, szind, + (szind < SC_NBINS), usize); } } bool fail = maybe_check_alloc_ctx(tsd, ptr, &alloc_ctx); @@ -3486,7 +3516,7 @@ do_rallocx(void *ptr, size_t size, int flags, bool is_realloc) { emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, &alloc_ctx); assert(alloc_ctx.szind != SC_NSIZES); - old_usize = sz_index2size(alloc_ctx.szind); + old_usize = emap_alloc_ctx_usize_get(&alloc_ctx); assert(old_usize == isalloc(tsd_tsdn(tsd), ptr)); if (aligned_usize_get(size, alignment, &usize, NULL, false)) { goto label_oom; @@ -3756,7 +3786,7 @@ je_xallocx(void *ptr, size_t size, size_t extra, int flags) { emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, &alloc_ctx); assert(alloc_ctx.szind != SC_NSIZES); - old_usize = sz_index2size(alloc_ctx.szind); + old_usize = emap_alloc_ctx_usize_get(&alloc_ctx); assert(old_usize == isalloc(tsd_tsdn(tsd), ptr)); /* * The API explicitly absolves itself of protecting against (size + diff --git a/src/prof_data.c b/src/prof_data.c index 39af0c90..437673ee 100644 --- a/src/prof_data.c +++ b/src/prof_data.c @@ -513,7 +513,13 @@ void prof_unbias_map_init(void) { /* See the comment in prof_sample_new_event_wait */ #ifdef JEMALLOC_PROF for (szind_t i = 0; i < SC_NSIZES; i++) { - double sz = (double)sz_index2size(i); + /* + * When limit_usize_gap is enabled, the unbiased calculation + * here is not as accurate as it was because usize now changes + * in a finer grain while the unbiased_sz is still calculated + * using the old way. 
+ */ + double sz = (double)sz_index2size_unsafe(i); double rate = (double)(ZU(1) << lg_prof_sample); double div_val = 1.0 - exp(-sz / rate); double unbiased_sz = sz / div_val; diff --git a/src/psset.c b/src/psset.c index 9a833193..e617f426 100644 --- a/src/psset.c +++ b/src/psset.c @@ -337,18 +337,50 @@ psset_update_end(psset_t *psset, hpdata_t *ps) { hpdata_assert_consistent(ps); } +hpdata_t * +psset_enumerate_search(psset_t *psset, pszind_t pind, size_t size) { + if (hpdata_age_heap_empty(&psset->pageslabs[pind])) { + return NULL; + } + + hpdata_t *ps = NULL; + hpdata_age_heap_enumerate_helper_t helper; + hpdata_age_heap_enumerate_prepare(&psset->pageslabs[pind], &helper, + PSSET_ENUMERATE_MAX_NUM, sizeof(helper.bfs_queue) / sizeof(void *)); + + while ((ps = hpdata_age_heap_enumerate_next(&psset->pageslabs[pind], + &helper))) { + if (hpdata_longest_free_range_get(ps) >= size) { + return ps; + } + } + + return NULL; +} + hpdata_t * psset_pick_alloc(psset_t *psset, size_t size) { assert((size & PAGE_MASK) == 0); assert(size <= HUGEPAGE); pszind_t min_pind = sz_psz2ind(sz_psz_quantize_ceil(size)); + hpdata_t *ps = NULL; + + /* See comments in eset_first_fit for why we enumerate search below. 
*/ + pszind_t pind_prev = sz_psz2ind(sz_psz_quantize_floor(size)); + if (sz_limit_usize_gap_enabled() && pind_prev < min_pind) { + ps = psset_enumerate_search(psset, pind_prev, size); + if (ps != NULL) { + return ps; + } + } + pszind_t pind = (pszind_t)fb_ffs(psset->pageslab_bitmap, PSSET_NPSIZES, (size_t)min_pind); if (pind == PSSET_NPSIZES) { return hpdata_empty_list_first(&psset->empty); } - hpdata_t *ps = hpdata_age_heap_first(&psset->pageslabs[pind]); + ps = hpdata_age_heap_first(&psset->pageslabs[pind]); if (ps == NULL) { return NULL; } diff --git a/src/sec.c b/src/sec.c index 19d69ff4..8827d1bd 100644 --- a/src/sec.c +++ b/src/sec.c @@ -24,6 +24,13 @@ bool sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, pai_t *fallback, const sec_opts_t *opts) { assert(opts->max_alloc >= PAGE); + /* + * Same as tcache, sec do not cache allocs/dallocs larger than + * USIZE_GROW_SLOW_THRESHOLD because the usize above this increases + * by PAGE and the number of usizes is too large. + */ + assert(!sz_limit_usize_gap_enabled() || + opts->max_alloc <= USIZE_GROW_SLOW_THRESHOLD); size_t max_alloc = PAGE_FLOOR(opts->max_alloc); pszind_t npsizes = sz_psz2ind(max_alloc) + 1; diff --git a/src/tcache.c b/src/tcache.c index 15da14da..270d38ac 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -1047,7 +1047,8 @@ tcache_bin_flush_impl_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin ndeferred++; continue; } - if (large_dalloc_safety_checks(edata, ptr, binind)) { + if (large_dalloc_safety_checks(edata, ptr, + sz_index2size(binind))) { /* See the comment in isfree. 
*/ continue; } diff --git a/test/integration/rallocx.c b/test/integration/rallocx.c index 68b8f381..85d9238b 100644 --- a/test/integration/rallocx.c +++ b/test/integration/rallocx.c @@ -49,7 +49,7 @@ TEST_BEGIN(test_grow_and_shrink) { size_t tsz; #define NCYCLES 3 unsigned i, j; -#define NSZS 1024 +#define NSZS 64 size_t szs[NSZS]; #define MAXSZ ZU(12 * 1024 * 1024) diff --git a/test/test.sh.in b/test/test.sh.in index b4fbb355..a4ee9396 100644 --- a/test/test.sh.in +++ b/test/test.sh.in @@ -43,6 +43,7 @@ for t in $@; do # per test shell script to ignore the @JEMALLOC_CPREFIX@ detail). enable_fill=@enable_fill@ \ enable_prof=@enable_prof@ \ + limit_usize_gap=@limit_usize_gap@ \ . @srcroot@${t}.sh && \ export_malloc_conf && \ $JEMALLOC_TEST_PREFIX ${t}@exe@ @abs_srcroot@ @abs_objroot@ diff --git a/test/unit/arena_reset.c b/test/unit/arena_reset.c index 8ef0786c..09536b29 100644 --- a/test/unit/arena_reset.c +++ b/test/unit/arena_reset.c @@ -78,7 +78,8 @@ vsalloc(tsdn_t *tsdn, const void *ptr) { return 0; } - return sz_index2size(full_alloc_ctx.szind); + return config_limit_usize_gap? 
edata_usize_get(full_alloc_ctx.edata): + sz_index2size(full_alloc_ctx.szind); } static unsigned diff --git a/test/unit/hpa.c b/test/unit/hpa.c index 50b96a87..6c42729a 100644 --- a/test/unit/hpa.c +++ b/test/unit/hpa.c @@ -5,7 +5,7 @@ #define SHARD_IND 111 -#define ALLOC_MAX (HUGEPAGE / 4) +#define ALLOC_MAX (HUGEPAGE) typedef struct test_data_s test_data_t; struct test_data_s { diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 02fedaa7..296b7bff 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -332,6 +332,7 @@ TEST_BEGIN(test_mallctl_opt) { TEST_MALLCTL_OPT(bool, prof_sys_thread_name, prof); TEST_MALLCTL_OPT(ssize_t, lg_san_uaf_align, uaf_detection); TEST_MALLCTL_OPT(unsigned, debug_double_free_max_scan, always); + TEST_MALLCTL_OPT(bool, limit_usize_gap, limit_usize_gap); #undef TEST_MALLCTL_OPT } diff --git a/test/unit/ph.c b/test/unit/ph.c index 28f5e488..0339f993 100644 --- a/test/unit/ph.c +++ b/test/unit/ph.c @@ -2,8 +2,9 @@ #include "jemalloc/internal/ph.h" +#define BFS_ENUMERATE_MAX 30 typedef struct node_s node_t; -ph_structs(heap, node_t); +ph_structs(heap, node_t, BFS_ENUMERATE_MAX); struct node_s { #define NODE_MAGIC 0x9823af7e @@ -239,6 +240,22 @@ TEST_BEGIN(test_ph_random) { expect_false(heap_empty(&heap), "Heap should not be empty"); + /* Enumerate nodes. */ + heap_enumerate_helper_t helper; + uint16_t max_queue_size = sizeof(helper.bfs_queue) + / sizeof(void *); + expect_u_eq(max_queue_size, BFS_ENUMERATE_MAX, + "Incorrect bfs queue length initialized"); + assert(max_queue_size == BFS_ENUMERATE_MAX); + heap_enumerate_prepare(&heap, &helper, + BFS_ENUMERATE_MAX, max_queue_size); + size_t node_count = 0; + while(heap_enumerate_next(&heap, &helper)) { + node_count ++; + } + expect_lu_eq(node_count, j, + "Unexpected enumeration results."); + /* Remove nodes. 
*/ switch (i % 6) { case 0: diff --git a/test/unit/sec.c b/test/unit/sec.c index 0b5e1c31..cfef043f 100644 --- a/test/unit/sec.c +++ b/test/unit/sec.c @@ -412,7 +412,8 @@ TEST_BEGIN(test_expand_shrink_delegate) { bool deferred_work_generated = false; - test_sec_init(&sec, &ta.pai, /* nshards */ 1, /* max_alloc */ 10 * PAGE, + test_sec_init(&sec, &ta.pai, /* nshards */ 1, + /* max_alloc */ USIZE_GROW_SLOW_THRESHOLD, /* max_bytes */ 1000 * PAGE); edata_t *edata = pai_alloc(tsdn, &sec.pai, PAGE, PAGE, /* zero */ false, /* guarded */ false, /* frequent_reuse */ false, diff --git a/test/unit/size_classes.c b/test/unit/size_classes.c index 9e8a408f..24913803 100644 --- a/test/unit/size_classes.c +++ b/test/unit/size_classes.c @@ -26,7 +26,8 @@ TEST_BEGIN(test_size_classes) { size_t size_class, max_size_class; szind_t index, gen_index, max_index; - max_size_class = get_max_size_class(); + max_size_class = sz_limit_usize_gap_enabled()? SC_SMALL_MAXCLASS: + get_max_size_class(); max_index = sz_size2index(max_size_class); for (index = 0, size_class = sz_index2size(index); index < max_index || @@ -79,6 +80,40 @@ TEST_BEGIN(test_size_classes) { } TEST_END +TEST_BEGIN(test_grow_slow_size_classes) { + test_skip_if(!sz_limit_usize_gap_enabled()); + + size_t size = SC_LARGE_MINCLASS; + size_t target_usize = SC_LARGE_MINCLASS; + size_t max_size = get_max_size_class(); + size_t increase[3] = {PAGE - 1, 1, 1}; + while (size <= max_size) { + size_t usize = sz_s2u(size); + expect_zu_eq(usize, target_usize, + "sz_s2u() does not generate usize as expected."); + size += increase[0]; + usize = sz_s2u(size); + target_usize += PAGE; + expect_zu_eq(usize, target_usize, + "sz_s2u() does not generate usize as expected."); + size += increase[1]; + usize = sz_s2u(size); + expect_zu_eq(usize, target_usize, + "sz_s2u() does not generate usize as expected."); + size += increase[2]; + usize = sz_s2u(size); + target_usize += PAGE; + expect_zu_eq(usize, target_usize, + "sz_s2u() does not generate usize 
as expected."); + if (target_usize << 1 < target_usize) { + break; + } + target_usize = target_usize << 1; + size = target_usize; + } +} +TEST_END + TEST_BEGIN(test_psize_classes) { size_t size_class, max_psz; pszind_t pind, max_pind; @@ -182,6 +217,7 @@ int main(void) { return test( test_size_classes, + test_grow_slow_size_classes, test_psize_classes, test_overflow); } diff --git a/test/unit/size_classes.sh b/test/unit/size_classes.sh new file mode 100644 index 00000000..93d5e8d1 --- /dev/null +++ b/test/unit/size_classes.sh @@ -0,0 +1,5 @@ +#!/bin/sh + +if [ "x${limit_usize_gap}" = "x1" ] ; then + export MALLOC_CONF="limit_usize_gap:true" +fi diff --git a/test/unit/stats.c b/test/unit/stats.c index 203a71b5..584a582f 100644 --- a/test/unit/stats.c +++ b/test/unit/stats.c @@ -202,17 +202,22 @@ TEST_END TEST_BEGIN(test_stats_arenas_large) { void *p; - size_t sz, allocated; + size_t sz, allocated, allocated_before; uint64_t epoch, nmalloc, ndalloc; + size_t malloc_size = (1U << (SC_LG_LARGE_MINCLASS + 1)) + 1; int expected = config_stats ? 
0 : ENOENT; - p = mallocx((1U << SC_LG_LARGE_MINCLASS), MALLOCX_ARENA(0)); + sz = sizeof(size_t); + expect_d_eq(mallctl("stats.arenas.0.large.allocated", + (void *)&allocated_before, &sz, NULL, 0), expected, + "Unexpected mallctl() result"); + + p = mallocx(malloc_size, MALLOCX_ARENA(0)); expect_ptr_not_null(p, "Unexpected mallocx() failure"); expect_d_eq(mallctl("epoch", NULL, NULL, (void *)&epoch, sizeof(epoch)), 0, "Unexpected mallctl() failure"); - sz = sizeof(size_t); expect_d_eq(mallctl("stats.arenas.0.large.allocated", (void *)&allocated, &sz, NULL, 0), expected, "Unexpected mallctl() result"); @@ -223,8 +228,10 @@ TEST_BEGIN(test_stats_arenas_large) { &sz, NULL, 0), expected, "Unexpected mallctl() result"); if (config_stats) { - expect_zu_gt(allocated, 0, + expect_zu_ge(allocated_before, 0, "allocated should be greater than zero"); + expect_zu_ge(allocated - allocated_before, sz_s2u(malloc_size), + "the diff between allocated should be greater than the allocation made"); expect_u64_gt(nmalloc, 0, "nmalloc should be greater than zero"); expect_u64_ge(nmalloc, ndalloc,