From ad108d50f1c30700389103ff5fe3ef5f538f804c Mon Sep 17 00:00:00 2001 From: Dmitry Ilvokhin Date: Tue, 21 Jan 2025 07:20:15 -0800 Subject: [PATCH] Extend purging algorithm with peak demand tracking Implementation is inspired by the idea described in "Beyond malloc efficiency to fleet efficiency: a hugepage-aware memory allocator" paper [1]. The primary idea is to track the maximum number (peak) of active pages in use within a sliding window and then use this number to decide how many dirty pages we would like to keep. We are trying to estimate the maximum amount of active memory we'll need in the near future. We do so by projecting future active memory demand (based on peak active memory usage we observed in the past within the sliding window) and adding slack on top of it (an overhead is reasonable to have in exchange for higher hugepages coverage). When peak demand tracking is off, the projection of future active memory is the active memory we have right now. The estimation is essentially the same as `nactive_max * (1 + dirty_mult)`. The peak demand purging algorithm is controlled by two config options. Option `hpa_peak_demand_window_ms` controls the duration of the sliding window we track maximum active memory usage in, and option `hpa_dirty_mult` controls the amount of slack we are allowed to have as a percentage of maximum active memory usage. By default `hpa_peak_demand_window_ms == 0` now and we have the same behaviour (ratio based purging) that we had before this commit. 
[1]: https://storage.googleapis.com/gweb-research2023-media/pubtools/6170.pdf --- Makefile.in | 2 + include/jemalloc/internal/hpa.h | 4 + include/jemalloc/internal/hpa_opts.h | 14 +- include/jemalloc/internal/peak_demand.h | 55 ++++++ .../projects/vc2015/jemalloc/jemalloc.vcxproj | 1 + .../vc2015/jemalloc/jemalloc.vcxproj.filters | 3 + .../projects/vc2017/jemalloc/jemalloc.vcxproj | 1 + .../vc2017/jemalloc/jemalloc.vcxproj.filters | 3 + .../projects/vc2019/jemalloc/jemalloc.vcxproj | 1 + .../vc2019/jemalloc/jemalloc.vcxproj.filters | 3 + .../projects/vc2022/jemalloc/jemalloc.vcxproj | 1 + .../vc2022/jemalloc/jemalloc.vcxproj.filters | 3 + src/ctl.c | 5 + src/hpa.c | 53 +++++- src/jemalloc.c | 5 + src/peak_demand.c | 74 ++++++++ src/stats.c | 1 + test/unit/hpa.c | 174 +++++++++++++++--- test/unit/mallctl.c | 1 + test/unit/peak_demand.c | 162 ++++++++++++++++ 20 files changed, 537 insertions(+), 29 deletions(-) create mode 100644 include/jemalloc/internal/peak_demand.h create mode 100644 src/peak_demand.c create mode 100644 test/unit/peak_demand.c diff --git a/Makefile.in b/Makefile.in index 1914fc28..b4102d0b 100644 --- a/Makefile.in +++ b/Makefile.in @@ -137,6 +137,7 @@ C_SRCS := $(srcroot)src/jemalloc.c \ $(srcroot)src/pai.c \ $(srcroot)src/pac.c \ $(srcroot)src/pages.c \ + $(srcroot)src/peak_demand.c \ $(srcroot)src/peak_event.c \ $(srcroot)src/prof.c \ $(srcroot)src/prof_data.c \ @@ -252,6 +253,7 @@ TESTS_UNIT := \ $(srcroot)test/unit/pack.c \ $(srcroot)test/unit/pages.c \ $(srcroot)test/unit/peak.c \ + $(srcroot)test/unit/peak_demand.c \ $(srcroot)test/unit/ph.c \ $(srcroot)test/unit/prng.c \ $(srcroot)test/unit/prof_accum.c \ diff --git a/include/jemalloc/internal/hpa.h b/include/jemalloc/internal/hpa.h index d788d051..a384d04a 100644 --- a/include/jemalloc/internal/hpa.h +++ b/include/jemalloc/internal/hpa.h @@ -10,6 +10,7 @@ #include "jemalloc/internal/hpa_opts.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/pai.h" +#include 
"jemalloc/internal/peak_demand.h" #include "jemalloc/internal/psset.h" typedef struct hpa_central_s hpa_central_t; @@ -147,6 +148,9 @@ struct hpa_shard_s { * Last time we performed purge on this shard. */ nstime_t last_purge; + + /* Peak active memory sliding window statistics. */ + peak_demand_t peak_demand; }; bool hpa_hugepage_size_exceeds_limit(); diff --git a/include/jemalloc/internal/hpa_opts.h b/include/jemalloc/internal/hpa_opts.h index 42246172..816bb577 100644 --- a/include/jemalloc/internal/hpa_opts.h +++ b/include/jemalloc/internal/hpa_opts.h @@ -27,7 +27,8 @@ struct hpa_shard_opts_s { /* * The HPA purges whenever the number of pages exceeds dirty_mult * - * active_pages. This may be set to (fxp_t)-1 to disable purging. + * peak_active_pages. This may be set to (fxp_t)-1 to disable + * purging. */ fxp_t dirty_mult; @@ -59,6 +60,13 @@ struct hpa_shard_opts_s { * Maximum number of hugepages to purge on each purging attempt. */ ssize_t experimental_max_purge_nhp; + + /* + * Sliding window duration to track active memory demand statistics. + * This might be set to 0, to disable sliding window statistics + * tracking and use current number of active pages for purging instead. + */ + uint64_t peak_demand_window_ms; }; #define HPA_SHARD_OPTS_DEFAULT { \ @@ -83,7 +91,9 @@ struct hpa_shard_opts_s { /* min_purge_interval_ms */ \ 5 * 1000, \ /* experimental_max_purge_nhp */ \ - -1 \ + -1, \ + /* peak_demand_window_ms */ \ + 0 \ } #endif /* JEMALLOC_INTERNAL_HPA_OPTS_H */ diff --git a/include/jemalloc/internal/peak_demand.h b/include/jemalloc/internal/peak_demand.h new file mode 100644 index 00000000..2664cbec --- /dev/null +++ b/include/jemalloc/internal/peak_demand.h @@ -0,0 +1,55 @@ +#ifndef JEMALLOC_INTERNAL_PEAK_DEMAND_H +#define JEMALLOC_INTERNAL_PEAK_DEMAND_H + +#include "jemalloc/internal/jemalloc_preamble.h" + +/* + * Implementation of peak active memory demand tracking. 
+ * + * Inspired by "Beyond malloc efficiency to fleet efficiency: a hugepage-aware + * memory allocator" whitepaper. + * https://storage.googleapis.com/gweb-research2023-media/pubtools/6170.pdf + * + * End goal is to track peak active memory usage over specified time interval. + * We do so by dividing this time interval into disjoint subintervals and + * storing value of maximum memory usage for each subinterval in a circular + * buffer. Nanoseconds resolution timestamp uniquely maps into epoch, which is + * used as an index to access circular buffer. + */ + +#define PEAK_DEMAND_LG_BUCKETS 4 +/* + * Number of buckets should be power of 2 to ensure modulo operation is + * optimized to bit masking by the compiler. + */ +#define PEAK_DEMAND_NBUCKETS (1 << PEAK_DEMAND_LG_BUCKETS) + +typedef struct peak_demand_s peak_demand_t; +struct peak_demand_s { + /* + * Absolute value of current epoch, monotonically increases over time. Epoch + * value modulo number of buckets used as an index to access nactive_max + * array. + */ + uint64_t epoch; + + /* How many nanoseconds each epoch approximately takes. */ + uint64_t epoch_interval_ns; + + /* + * Circular buffer to track maximum number of active pages for each + * epoch. + */ + size_t nactive_max[PEAK_DEMAND_NBUCKETS]; +}; + +void peak_demand_init(peak_demand_t *peak_demand, uint64_t interval_ms); + +/* Updates peak demand statistics with current number of active pages. */ +void peak_demand_update(peak_demand_t *peak_demand, const nstime_t *now, + size_t nactive); + +/* Returns maximum number of active pages in sliding window. 
*/ +size_t peak_demand_nactive_max(peak_demand_t *peak_demand); + +#endif /* JEMALLOC_INTERNAL_PEAK_DEMAND_H */ diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj index c43b30b1..97a95fbf 100644 --- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj +++ b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj @@ -76,6 +76,7 @@ + diff --git a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters index f091475e..1a89369e 100644 --- a/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters +++ b/msvc/projects/vc2015/jemalloc/jemalloc.vcxproj.filters @@ -112,6 +112,9 @@ Source Files + + Source Files + Source Files diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj index a195f6b3..8529438c 100644 --- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj +++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj @@ -76,6 +76,7 @@ + diff --git a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters index f091475e..1a89369e 100644 --- a/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters +++ b/msvc/projects/vc2017/jemalloc/jemalloc.vcxproj.filters @@ -112,6 +112,9 @@ Source Files + + Source Files + Source Files diff --git a/msvc/projects/vc2019/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2019/jemalloc/jemalloc.vcxproj index cd16005d..eace48ba 100644 --- a/msvc/projects/vc2019/jemalloc/jemalloc.vcxproj +++ b/msvc/projects/vc2019/jemalloc/jemalloc.vcxproj @@ -76,6 +76,7 @@ + diff --git a/msvc/projects/vc2019/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2019/jemalloc/jemalloc.vcxproj.filters index f091475e..1a89369e 100644 --- a/msvc/projects/vc2019/jemalloc/jemalloc.vcxproj.filters +++ b/msvc/projects/vc2019/jemalloc/jemalloc.vcxproj.filters @@ -112,6 +112,9 @@ Source Files + + Source Files + Source Files diff --git 
a/msvc/projects/vc2022/jemalloc/jemalloc.vcxproj b/msvc/projects/vc2022/jemalloc/jemalloc.vcxproj index 2d8c4be6..98085cfd 100644 --- a/msvc/projects/vc2022/jemalloc/jemalloc.vcxproj +++ b/msvc/projects/vc2022/jemalloc/jemalloc.vcxproj @@ -76,6 +76,7 @@ + diff --git a/msvc/projects/vc2022/jemalloc/jemalloc.vcxproj.filters b/msvc/projects/vc2022/jemalloc/jemalloc.vcxproj.filters index f091475e..1a89369e 100644 --- a/msvc/projects/vc2022/jemalloc/jemalloc.vcxproj.filters +++ b/msvc/projects/vc2022/jemalloc/jemalloc.vcxproj.filters @@ -112,6 +112,9 @@ Source Files + + Source Files + Source Files diff --git a/src/ctl.c b/src/ctl.c index c55d9719..2c941ae8 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -106,6 +106,7 @@ CTL_PROTO(opt_hpa_hugify_delay_ms) CTL_PROTO(opt_hpa_hugify_sync) CTL_PROTO(opt_hpa_min_purge_interval_ms) CTL_PROTO(opt_experimental_hpa_max_purge_nhp) +CTL_PROTO(opt_hpa_peak_demand_window_ms) CTL_PROTO(opt_hpa_dirty_mult) CTL_PROTO(opt_hpa_sec_nshards) CTL_PROTO(opt_hpa_sec_max_alloc) @@ -487,6 +488,8 @@ static const ctl_named_node_t opt_node[] = { {NAME("hpa_min_purge_interval_ms"), CTL(opt_hpa_min_purge_interval_ms)}, {NAME("experimental_hpa_max_purge_nhp"), CTL(opt_experimental_hpa_max_purge_nhp)}, + {NAME("hpa_peak_demand_window_ms"), + CTL(opt_hpa_peak_demand_window_ms)}, {NAME("hpa_dirty_mult"), CTL(opt_hpa_dirty_mult)}, {NAME("hpa_sec_nshards"), CTL(opt_hpa_sec_nshards)}, {NAME("hpa_sec_max_alloc"), CTL(opt_hpa_sec_max_alloc)}, @@ -2255,6 +2258,8 @@ CTL_RO_NL_GEN(opt_hpa_min_purge_interval_ms, opt_hpa_opts.min_purge_interval_ms, uint64_t) CTL_RO_NL_GEN(opt_experimental_hpa_max_purge_nhp, opt_hpa_opts.experimental_max_purge_nhp, ssize_t) +CTL_RO_NL_GEN(opt_hpa_peak_demand_window_ms, + opt_hpa_opts.peak_demand_window_ms, uint64_t) /* * This will have to change before we publicly document this option; fxp_t and diff --git a/src/hpa.c b/src/hpa.c index 2a5d7e1f..c01dde13 100644 --- a/src/hpa.c +++ b/src/hpa.c @@ -63,6 +63,11 @@ hpa_supported(void) { 
return true; } +static bool +hpa_peak_demand_tracking_enabled(hpa_shard_t *shard) { + return shard->opts.peak_demand_window_ms > 0; +} + static void hpa_do_consistency_checks(hpa_shard_t *shard) { assert(shard->base != NULL); @@ -217,6 +222,11 @@ hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap, shard->stats.nhugify_failures = 0; shard->stats.ndehugifies = 0; + if (hpa_peak_demand_tracking_enabled(shard)) { + peak_demand_init(&shard->peak_demand, + shard->opts.peak_demand_window_ms); + } + /* * Fill these in last, so that if an hpa_shard gets used despite * initialization failing, we'll at least crash instead of just @@ -294,8 +304,37 @@ hpa_ndirty_max(tsdn_t *tsdn, hpa_shard_t *shard) { if (shard->opts.dirty_mult == (fxp_t)-1) { return (size_t)-1; } - return fxp_mul_frac(psset_nactive(&shard->psset), - shard->opts.dirty_mult); + /* + * We are trying to estimate maximum amount of active memory we'll + * need in the near future. We do so by projecting future active + * memory demand (based on peak active memory usage we observed in the + * past within sliding window) and adding slack on top of it (an + * overhead is reasonable to have in exchange of higher hugepages + * coverage). When peak demand tracking is off, projection of future + * active memory is active memory we are having right now. + * + * Estimation is essentially the same as nactive_max * (1 + + * dirty_mult), but expressed differently to factor in necessary + * implementation details. + */ + size_t nactive = psset_nactive(&shard->psset); + size_t nactive_max = nactive; + if (hpa_peak_demand_tracking_enabled(shard)) { + /* + * We release shard->mtx, when we do a syscall to purge dirty + * memory, so someone might grab shard->mtx, allocate memory + * from this shard and update psset's nactive counter, before + * peak_demand_update(...) was called and we'll get + * peak_demand_nactive_max(...) <= nactive as a result. 
+ */ + size_t peak = peak_demand_nactive_max(&shard->peak_demand); + if (peak > nactive_max) { + nactive_max = peak; + } + } + size_t slack = fxp_mul_frac(nactive_max, shard->opts.dirty_mult); + size_t estimation = nactive_max + slack; + return estimation - nactive; } static bool @@ -548,6 +587,16 @@ static void hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard, bool forced) { malloc_mutex_assert_owner(tsdn, &shard->mtx); + + /* Update active memory demand statistics. */ + if (hpa_peak_demand_tracking_enabled(shard)) { + nstime_t now; + shard->central->hooks.curtime(&now, + /* first_reading */ true); + peak_demand_update(&shard->peak_demand, &now, + psset_nactive(&shard->psset)); + } + if (!forced && shard->opts.deferral_allowed) { return; } diff --git a/src/jemalloc.c b/src/jemalloc.c index 9f4bc785..d08771f8 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1568,6 +1568,11 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], opt_hpa_opts.experimental_max_purge_nhp, "experimental_hpa_max_purge_nhp", -1, SSIZE_MAX); + CONF_HANDLE_UINT64_T( + opt_hpa_opts.peak_demand_window_ms, + "hpa_peak_demand_window_ms", 0, 0, + CONF_DONT_CHECK_MIN, CONF_DONT_CHECK_MAX, false); + if (CONF_MATCH("hpa_dirty_mult")) { if (CONF_MATCH_VALUE("-1")) { opt_hpa_opts.dirty_mult = (fxp_t)-1; diff --git a/src/peak_demand.c b/src/peak_demand.c new file mode 100644 index 00000000..49f28930 --- /dev/null +++ b/src/peak_demand.c @@ -0,0 +1,74 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/peak_demand.h" + +void +peak_demand_init(peak_demand_t *peak_demand, uint64_t interval_ms) { + assert(interval_ms > 0); + peak_demand->epoch = 0; + uint64_t interval_ns = interval_ms * 1000 * 1000; + peak_demand->epoch_interval_ns = interval_ns / PEAK_DEMAND_NBUCKETS; + memset(peak_demand->nactive_max, 0, sizeof(peak_demand->nactive_max)); +} + +static uint64_t 
+peak_demand_epoch_ind(peak_demand_t *peak_demand) { + return peak_demand->epoch % PEAK_DEMAND_NBUCKETS; +} + +static nstime_t +peak_demand_next_epoch_advance(peak_demand_t *peak_demand) { + uint64_t epoch = peak_demand->epoch; + uint64_t ns = (epoch + 1) * peak_demand->epoch_interval_ns; + nstime_t next; + nstime_init(&next, ns); + return next; +} + +static uint64_t +peak_demand_maybe_advance_epoch(peak_demand_t *peak_demand, + const nstime_t *now) { + nstime_t next_epoch_advance = + peak_demand_next_epoch_advance(peak_demand); + if (nstime_compare(now, &next_epoch_advance) < 0) { + return peak_demand_epoch_ind(peak_demand); + } + uint64_t next_epoch = nstime_ns(now) / peak_demand->epoch_interval_ns; + assert(next_epoch > peak_demand->epoch); + /* + * If we missed more epochs than the capacity of the circular buffer + * (PEAK_DEMAND_NBUCKETS), rewrite no more than PEAK_DEMAND_NBUCKETS + * items as we don't want to zero out the same item multiple times. + */ + if (peak_demand->epoch + PEAK_DEMAND_NBUCKETS < next_epoch) { + peak_demand->epoch = next_epoch - PEAK_DEMAND_NBUCKETS; + } + while (peak_demand->epoch < next_epoch) { + ++peak_demand->epoch; + uint64_t ind = peak_demand_epoch_ind(peak_demand); + peak_demand->nactive_max[ind] = 0; + } + return peak_demand_epoch_ind(peak_demand); +} + +void +peak_demand_update(peak_demand_t *peak_demand, const nstime_t *now, + size_t nactive) { + uint64_t ind = peak_demand_maybe_advance_epoch(peak_demand, now); + size_t *epoch_nactive = &peak_demand->nactive_max[ind]; + if (nactive > *epoch_nactive) { + *epoch_nactive = nactive; + } +} + +size_t +peak_demand_nactive_max(peak_demand_t *peak_demand) { + size_t nactive_max = peak_demand->nactive_max[0]; + for (int i = 1; i < PEAK_DEMAND_NBUCKETS; ++i) { + if (peak_demand->nactive_max[i] > nactive_max) { + nactive_max = peak_demand->nactive_max[i]; + } + } + return nactive_max; +} diff --git a/src/stats.c b/src/stats.c index 58874bf8..bd0167fb 100644 --- a/src/stats.c +++ b/src/stats.c @@ 
-1657,6 +1657,7 @@ stats_general_print(emitter_t *emitter) { OPT_WRITE_BOOL("hpa_hugify_sync") OPT_WRITE_UINT64("hpa_min_purge_interval_ms") OPT_WRITE_SSIZE_T("experimental_hpa_max_purge_nhp") + OPT_WRITE_UINT64("hpa_peak_demand_window_ms") if (je_mallctl("opt.hpa_dirty_mult", (void *)&u32v, &u32sz, NULL, 0) == 0) { /* diff --git a/test/unit/hpa.c b/test/unit/hpa.c index 6c42729a..ceed9bd8 100644 --- a/test/unit/hpa.c +++ b/test/unit/hpa.c @@ -37,26 +37,9 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = { /* min_purge_interval_ms */ 5 * 1000, /* experimental_max_purge_nhp */ - -1 -}; - -static hpa_shard_opts_t test_hpa_shard_opts_purge = { - /* slab_max_alloc */ - HUGEPAGE, - /* hugification_threshold */ - 0.9 * HUGEPAGE, - /* dirty_mult */ - FXP_INIT_PERCENT(11), - /* deferral_allowed */ - true, - /* hugify_delay_ms */ - 0, - /* hugify_sync */ - false, - /* min_purge_interval_ms */ - 5 * 1000, - /* experimental_max_purge_nhp */ - -1 + -1, + /* peak_demand_window_ms */ + 0 }; static hpa_shard_t * @@ -480,8 +463,14 @@ TEST_END TEST_BEGIN(test_purge_no_infinite_loop) { test_skip_if(!hpa_supported()); - hpa_shard_t *shard = create_test_data(&hpa_hooks_default, - &test_hpa_shard_opts_purge); + hpa_shard_opts_t opts = test_hpa_shard_opts_default; + opts.slab_max_alloc = HUGEPAGE; + opts.hugification_threshold = 0.9 * HUGEPAGE; + opts.dirty_mult = FXP_INIT_PERCENT(11); + opts.deferral_allowed = true; + opts.hugify_delay_ms = 0; + + hpa_shard_t *shard = create_test_data(&hpa_hooks_default, &opts); tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); /* @@ -489,8 +478,7 @@ TEST_BEGIN(test_purge_no_infinite_loop) { * criteria for huge page and at the same time do not allow hugify page * without triggering a purge. 
*/ - const size_t npages = - test_hpa_shard_opts_purge.hugification_threshold / PAGE + 1; + const size_t npages = opts.hugification_threshold / PAGE + 1; const size_t size = npages * PAGE; bool deferred_work_generated = false; @@ -733,6 +721,140 @@ TEST_BEGIN(test_experimental_max_purge_nhp) { } TEST_END +TEST_BEGIN(test_demand_purge_slack) { + test_skip_if(!hpa_supported()); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + + hpa_shard_opts_t opts = test_hpa_shard_opts_default; + opts.deferral_allowed = true; + /* Allow 10% of slack. */ + opts.dirty_mult = FXP_INIT_PERCENT(10); + /* Peak demand sliding window duration is 10 seconds. */ + opts.peak_demand_window_ms = 10 * 1000; + + hpa_shard_t *shard = create_test_data(&hooks, &opts); + + bool deferred_work_generated = false; + + nstime_init(&defer_curtime, 0); + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + enum {NALLOCS = 16 * HUGEPAGE_PAGES}; + edata_t *edatas[NALLOCS]; + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + + /* Deallocate 5 hugepages out of 16. */ + for (int i = 0; i < 5 * (int)HUGEPAGE_PAGES; i++) { + pai_dalloc(tsdn, &shard->pai, edatas[i], + &deferred_work_generated); + } + nstime_init2(&defer_curtime, 6, 0); + hpa_shard_do_deferred_work(tsdn, shard); + + expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early"); + expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early"); + /* + * Peak demand within sliding window is 16 hugepages, so we don't need + * to purge anything just yet. 
+ */ + expect_zu_eq(0, ndefer_purge_calls, "Purged too early"); + + nstime_init2(&defer_curtime, 12, 0); + hpa_shard_do_deferred_work(tsdn, shard); + + expect_zu_eq(11, ndefer_hugify_calls, "Expect hugification"); + ndefer_hugify_calls = 0; + expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early"); + /* + * 12 seconds passed now, peak demand is 11 hugepages, we allowed to + * keep 11 * 0.1 (hpa_dirty_mult) = 1.1 dirty hugepages, but we + * have 5 dirty hugepages, so we should purge 4 of them. + */ + expect_zu_eq(4, ndefer_purge_calls, "Expect purges"); + ndefer_purge_calls = 0; + + destroy_test_data(shard); +} +TEST_END + +TEST_BEGIN(test_demand_purge_tight) { + test_skip_if(!hpa_supported()); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + + hpa_shard_opts_t opts = test_hpa_shard_opts_default; + opts.deferral_allowed = true; + /* No slack allowed. */ + opts.dirty_mult = FXP_INIT_PERCENT(0); + /* Peak demand sliding window duration is 10 seconds. */ + opts.peak_demand_window_ms = 10 * 1000; + + hpa_shard_t *shard = create_test_data(&hooks, &opts); + + bool deferred_work_generated = false; + + nstime_init(&defer_curtime, 0); + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + enum {NALLOCS = 16 * HUGEPAGE_PAGES}; + edata_t *edatas[NALLOCS]; + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + + /* Deallocate 5 hugepages out of 16. 
*/ + for (int i = 0; i < 5 * (int)HUGEPAGE_PAGES; i++) { + pai_dalloc(tsdn, &shard->pai, edatas[i], + &deferred_work_generated); + } + nstime_init2(&defer_curtime, 6, 0); + hpa_shard_do_deferred_work(tsdn, shard); + + expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early"); + expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early"); + /* + * Peak demand within sliding window is 16 hugepages, so we don't need + * to purge anything just yet. + */ + expect_zu_eq(0, ndefer_purge_calls, "Purged too early"); + + nstime_init2(&defer_curtime, 12, 0); + hpa_shard_do_deferred_work(tsdn, shard); + + expect_zu_eq(11, ndefer_hugify_calls, "Expect hugification"); + ndefer_hugify_calls = 0; + expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early"); + /* + * 12 seconds passed now, peak demand is 11 hugepages. We have + * hpa_dirty_mult = 0, so we are allowed to keep 11 * 0 = 0 dirty + * hugepages, but we have 5, all of them should be purged. + */ + expect_zu_eq(5, ndefer_purge_calls, "Expect purges"); + ndefer_purge_calls = 0; + + destroy_test_data(shard); +} +TEST_END + int main(void) { /* @@ -756,5 +878,7 @@ main(void) { test_no_min_purge_interval, test_min_purge_interval, test_purge, - test_experimental_max_purge_nhp); + test_experimental_max_purge_nhp, + test_demand_purge_slack, + test_demand_purge_tight); } diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c index 57aa59e5..366b992b 100644 --- a/test/unit/mallctl.c +++ b/test/unit/mallctl.c @@ -295,6 +295,7 @@ TEST_BEGIN(test_mallctl_opt) { TEST_MALLCTL_OPT(size_t, hpa_sec_bytes_after_flush, always); TEST_MALLCTL_OPT(size_t, hpa_sec_batch_fill_extra, always); TEST_MALLCTL_OPT(ssize_t, experimental_hpa_max_purge_nhp, always); + TEST_MALLCTL_OPT(uint64_t, hpa_peak_demand_window_ms, always); TEST_MALLCTL_OPT(unsigned, narenas, always); TEST_MALLCTL_OPT(const char *, percpu_arena, always); TEST_MALLCTL_OPT(size_t, oversize_threshold, always); diff --git a/test/unit/peak_demand.c b/test/unit/peak_demand.c new file mode 100644 index 
00000000..ca2506b8 --- /dev/null +++ b/test/unit/peak_demand.c @@ -0,0 +1,162 @@ +#include "test/jemalloc_test.h" + +#include "jemalloc/internal/peak_demand.h" + +TEST_BEGIN(test_peak_demand_init) { + peak_demand_t peak_demand; + /* + * Exact value doesn't matter here as we don't advance epoch in this + * test. + */ + uint64_t interval_ms = 1000; + peak_demand_init(&peak_demand, interval_ms); + + expect_zu_eq(peak_demand_nactive_max(&peak_demand), 0, + "Unexpected ndirty_max value after initialization"); +} +TEST_END + +TEST_BEGIN(test_peak_demand_update_basic) { + peak_demand_t peak_demand; + /* Make each bucket exactly one second to simplify math. */ + uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS; + peak_demand_init(&peak_demand, interval_ms); + + nstime_t now; + + nstime_init2(&now, /* sec */ 0, /* nsec */ 0); + peak_demand_update(&peak_demand, &now, /* nactive */ 1024); + + nstime_init2(&now, /* sec */ 1, /* nsec */ 0); + peak_demand_update(&peak_demand, &now, /* nactive */ 512); + + nstime_init2(&now, /* sec */ 2, /* nsec */ 0); + peak_demand_update(&peak_demand, &now, /* nactive */ 256); + + expect_zu_eq(peak_demand_nactive_max(&peak_demand), 1024, ""); +} +TEST_END + +TEST_BEGIN(test_peak_demand_update_skip_epochs) { + peak_demand_t peak_demand; + uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS; + peak_demand_init(&peak_demand, interval_ms); + + nstime_t now; + + nstime_init2(&now, /* sec */ 0, /* nsec */ 0); + peak_demand_update(&peak_demand, &now, /* nactive */ 1024); + + nstime_init2(&now, /* sec */ PEAK_DEMAND_NBUCKETS - 1, /* nsec */ 0); + peak_demand_update(&peak_demand, &now, /* nactive */ 512); + + nstime_init2(&now, /* sec */ 2 * (PEAK_DEMAND_NBUCKETS - 1), + /* nsec */ 0); + peak_demand_update(&peak_demand, &now, /* nactive */ 256); + + /* + * Updates are not evenly spread over time. When we update at + * 2 * (PEAK_DEMAND_NBUCKETS - 1) second, 1024 value is already out of + * sliding window, but 512 is still present. 
+ */ + expect_zu_eq(peak_demand_nactive_max(&peak_demand), 512, ""); +} +TEST_END + +TEST_BEGIN(test_peak_demand_update_rewrite_optimization) { + peak_demand_t peak_demand; + uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS; + peak_demand_init(&peak_demand, interval_ms); + + nstime_t now; + + nstime_init2(&now, /* sec */ 0, /* nsec */ 0); + peak_demand_update(&peak_demand, &now, /* nactive */ 1024); + + nstime_init2(&now, /* sec */ 0, /* nsec */ UINT64_MAX); + /* + * This update should take reasonable time if optimization is working + * correctly, otherwise we'll loop from 0 to UINT64_MAX and this test + * will take a long time to finish. + */ + peak_demand_update(&peak_demand, &now, /* nactive */ 512); + + expect_zu_eq(peak_demand_nactive_max(&peak_demand), 512, ""); +} +TEST_END + +TEST_BEGIN(test_peak_demand_update_out_of_interval) { + peak_demand_t peak_demand; + uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS; + peak_demand_init(&peak_demand, interval_ms); + + nstime_t now; + + nstime_init2(&now, /* sec */ 0 * PEAK_DEMAND_NBUCKETS, /* nsec */ 0); + peak_demand_update(&peak_demand, &now, /* nactive */ 1024); + + nstime_init2(&now, /* sec */ 1 * PEAK_DEMAND_NBUCKETS, /* nsec */ 0); + peak_demand_update(&peak_demand, &now, /* nactive */ 512); + + nstime_init2(&now, /* sec */ 2 * PEAK_DEMAND_NBUCKETS, /* nsec */ 0); + peak_demand_update(&peak_demand, &now, /* nactive */ 256); + + /* + * Updates frequency is lower than tracking interval, so we should + * have only last value. + */ + expect_zu_eq(peak_demand_nactive_max(&peak_demand), 256, ""); +} +TEST_END + +TEST_BEGIN(test_peak_demand_update_static_epoch) { + peak_demand_t peak_demand; + uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS; + peak_demand_init(&peak_demand, interval_ms); + + nstime_t now; + nstime_init_zero(&now); + + /* Big enough value to overwrite values in circular buffer. 
*/ + size_t nactive_max = 2 * PEAK_DEMAND_NBUCKETS; + for (size_t nactive = 0; nactive <= nactive_max; ++nactive) { + /* + * We should override value in the same bucket as now value + * doesn't change between iterations. + */ + peak_demand_update(&peak_demand, &now, nactive); + } + + expect_zu_eq(peak_demand_nactive_max(&peak_demand), nactive_max, ""); +} +TEST_END + +TEST_BEGIN(test_peak_demand_update_epoch_advance) { + peak_demand_t peak_demand; + uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS; + peak_demand_init(&peak_demand, interval_ms); + + nstime_t now; + /* Big enough value to overwrite values in circular buffer. */ + size_t nactive_max = 2 * PEAK_DEMAND_NBUCKETS; + for (size_t nactive = 0; nactive <= nactive_max; ++nactive) { + uint64_t sec = nactive; + nstime_init2(&now, sec, /* nsec */ 0); + peak_demand_update(&peak_demand, &now, nactive); + } + + expect_zu_eq(peak_demand_nactive_max(&peak_demand), nactive_max, ""); +} +TEST_END + +int +main(void) { + return test_no_reentrancy( + test_peak_demand_init, + test_peak_demand_update_basic, + test_peak_demand_update_skip_epochs, + test_peak_demand_update_rewrite_optimization, + test_peak_demand_update_out_of_interval, + test_peak_demand_update_static_epoch, + test_peak_demand_update_epoch_advance); +}