Extend purging algorithm with peak demand tracking

The implementation is inspired by the idea described in the paper
"Beyond malloc efficiency to fleet efficiency: a hugepage-aware memory
allocator" [1].

The primary idea is to track the maximum number (peak) of active pages
in use within a sliding window, and then use this number to decide how
many dirty pages we would like to keep.

We are trying to estimate the maximum amount of active memory we will
need in the near future. We do so by projecting future active memory
demand (based on the peak active memory usage observed in the past
within the sliding window) and adding slack on top of it (some overhead
is reasonable to accept in exchange for higher hugepage coverage). When
peak demand tracking is off, the projection of future active memory is
simply the active memory we have right now.

The estimate is essentially `nactive_max * (1 + dirty_mult)`.
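
To make the arithmetic concrete (an illustrative sketch only; the
helper name below is made up and not part of this commit): with a peak
demand of 11 hugepages and dirty_mult = 0.1, the retained footprint
target is 11 * (1 + 0.1) = 12.1 hugepages, i.e. up to 1.1 hugepages of
dirty memory may stay resident on top of peak demand.

/*
 * Sketch: retained footprint target derived from peak demand.  The
 * name `peak_demand_retain_target` is illustrative, not from the
 * commit; the real code uses fixed-point (fxp) arithmetic rather
 * than doubles.
 */
static size_t
peak_demand_retain_target(size_t nactive_max, double dirty_mult) {
	/* Keep peak demand plus a dirty_mult fraction of slack. */
	return (size_t)(nactive_max * (1 + dirty_mult));
}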

The peak demand purging algorithm is controlled by two config options.
`hpa_peak_demand_window_ms` controls the duration of the sliding window
over which we track maximum active memory usage, and `hpa_dirty_mult`
controls the amount of slack we are allowed to keep, as a percentage of
the maximum active memory usage. By default `hpa_peak_demand_window_ms
== 0`, which keeps the same behaviour (ratio-based purging) that we had
before this commit.
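
For example, a 10 second window with 10% slack could be requested via
the malloc_conf string (a sketch; the option spellings are assumed to
match the mallctl option names exercised in the tests below):

/* Assumes the HPA is enabled and these option names are accepted. */
const char *malloc_conf =
    "hpa:true,hpa_peak_demand_window_ms:10000,hpa_dirty_mult:0.1";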

[1]: https://storage.googleapis.com/gweb-research2023-media/pubtools/6170.pdf
Author: Dmitry Ilvokhin
Date: 2025-01-21 07:20:15 -08:00
Commit: d36aa77e8a (parent: 499f306859)
20 changed files with 537 additions and 29 deletions

test/unit/hpa.c

@@ -37,26 +37,9 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = {
 	/* min_purge_interval_ms */
 	5 * 1000,
 	/* experimental_max_purge_nhp */
-	-1
-};
-static hpa_shard_opts_t test_hpa_shard_opts_purge = {
-	/* slab_max_alloc */
-	HUGEPAGE,
-	/* hugification_threshold */
-	0.9 * HUGEPAGE,
-	/* dirty_mult */
-	FXP_INIT_PERCENT(11),
-	/* deferral_allowed */
-	true,
-	/* hugify_delay_ms */
-	0,
-	/* hugify_sync */
-	false,
-	/* min_purge_interval_ms */
-	5 * 1000,
-	/* experimental_max_purge_nhp */
-	-1
+	-1,
+	/* peak_demand_window_ms */
+	0
 };
 static hpa_shard_t *
@@ -480,8 +463,14 @@ TEST_END
 TEST_BEGIN(test_purge_no_infinite_loop) {
 	test_skip_if(!hpa_supported());
-	hpa_shard_t *shard = create_test_data(&hpa_hooks_default,
-	    &test_hpa_shard_opts_purge);
+	hpa_shard_opts_t opts = test_hpa_shard_opts_default;
+	opts.slab_max_alloc = HUGEPAGE;
+	opts.hugification_threshold = 0.9 * HUGEPAGE;
+	opts.dirty_mult = FXP_INIT_PERCENT(11);
+	opts.deferral_allowed = true;
+	opts.hugify_delay_ms = 0;
+	hpa_shard_t *shard = create_test_data(&hpa_hooks_default, &opts);
 	tsdn_t *tsdn = tsd_tsdn(tsd_fetch());
 	/*
@@ -489,8 +478,7 @@ TEST_BEGIN(test_purge_no_infinite_loop) {
 	 * criteria for huge page and at the same time do not allow hugify page
 	 * without triggering a purge.
 	 */
-	const size_t npages =
-	    test_hpa_shard_opts_purge.hugification_threshold / PAGE + 1;
+	const size_t npages = opts.hugification_threshold / PAGE + 1;
 	const size_t size = npages * PAGE;
 	bool deferred_work_generated = false;
@@ -733,6 +721,140 @@ TEST_BEGIN(test_experimental_max_purge_nhp) {
}
TEST_END
TEST_BEGIN(test_demand_purge_slack) {
test_skip_if(!hpa_supported());
hpa_hooks_t hooks;
hooks.map = &defer_test_map;
hooks.unmap = &defer_test_unmap;
hooks.purge = &defer_test_purge;
hooks.hugify = &defer_test_hugify;
hooks.dehugify = &defer_test_dehugify;
hooks.curtime = &defer_test_curtime;
hooks.ms_since = &defer_test_ms_since;
hpa_shard_opts_t opts = test_hpa_shard_opts_default;
opts.deferral_allowed = true;
/* Allow 10% of slack. */
opts.dirty_mult = FXP_INIT_PERCENT(10);
/* Peak demand sliding window duration is 10 seconds. */
opts.peak_demand_window_ms = 10 * 1000;
hpa_shard_t *shard = create_test_data(&hooks, &opts);
bool deferred_work_generated = false;
nstime_init(&defer_curtime, 0);
tsdn_t *tsdn = tsd_tsdn(tsd_fetch());
enum {NALLOCS = 16 * HUGEPAGE_PAGES};
edata_t *edatas[NALLOCS];
for (int i = 0; i < NALLOCS; i++) {
edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false,
false, false, &deferred_work_generated);
expect_ptr_not_null(edatas[i], "Unexpected null edata");
}
/* Deallocate 5 hugepages out of 16. */
for (int i = 0; i < 5 * (int)HUGEPAGE_PAGES; i++) {
pai_dalloc(tsdn, &shard->pai, edatas[i],
&deferred_work_generated);
}
nstime_init2(&defer_curtime, 6, 0);
hpa_shard_do_deferred_work(tsdn, shard);
expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early");
expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
/*
* Peak demand within sliding window is 16 hugepages, so we don't need
* to purge anything just yet.
*/
expect_zu_eq(0, ndefer_purge_calls, "Purged too early");
nstime_init2(&defer_curtime, 12, 0);
hpa_shard_do_deferred_work(tsdn, shard);
expect_zu_eq(11, ndefer_hugify_calls, "Expect hugification");
ndefer_hugify_calls = 0;
expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
/*
 * 12 seconds passed now, peak demand is 11 hugepages, so we are
 * allowed to keep 11 * 0.1 (hpa_dirty_mult) = 1.1 dirty hugepages,
 * but we have 5 dirty hugepages, so we should purge 4 of them.
 */
expect_zu_eq(4, ndefer_purge_calls, "Expect purges");
ndefer_purge_calls = 0;
destroy_test_data(shard);
}
TEST_END
TEST_BEGIN(test_demand_purge_tight) {
test_skip_if(!hpa_supported());
hpa_hooks_t hooks;
hooks.map = &defer_test_map;
hooks.unmap = &defer_test_unmap;
hooks.purge = &defer_test_purge;
hooks.hugify = &defer_test_hugify;
hooks.dehugify = &defer_test_dehugify;
hooks.curtime = &defer_test_curtime;
hooks.ms_since = &defer_test_ms_since;
hpa_shard_opts_t opts = test_hpa_shard_opts_default;
opts.deferral_allowed = true;
/* No slack allowed. */
opts.dirty_mult = FXP_INIT_PERCENT(0);
/* Peak demand sliding window duration is 10 seconds. */
opts.peak_demand_window_ms = 10 * 1000;
hpa_shard_t *shard = create_test_data(&hooks, &opts);
bool deferred_work_generated = false;
nstime_init(&defer_curtime, 0);
tsdn_t *tsdn = tsd_tsdn(tsd_fetch());
enum {NALLOCS = 16 * HUGEPAGE_PAGES};
edata_t *edatas[NALLOCS];
for (int i = 0; i < NALLOCS; i++) {
edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false,
false, false, &deferred_work_generated);
expect_ptr_not_null(edatas[i], "Unexpected null edata");
}
/* Deallocate 5 hugepages out of 16. */
for (int i = 0; i < 5 * (int)HUGEPAGE_PAGES; i++) {
pai_dalloc(tsdn, &shard->pai, edatas[i],
&deferred_work_generated);
}
nstime_init2(&defer_curtime, 6, 0);
hpa_shard_do_deferred_work(tsdn, shard);
expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early");
expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
/*
 * Peak demand within sliding window is 16 hugepages, so we don't need
 * to purge anything just yet.
 */
expect_zu_eq(0, ndefer_purge_calls, "Purged too early");
nstime_init2(&defer_curtime, 12, 0);
hpa_shard_do_deferred_work(tsdn, shard);
expect_zu_eq(11, ndefer_hugify_calls, "Expect hugification");
ndefer_hugify_calls = 0;
expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
/*
 * 12 seconds passed now, peak demand is 11 hugepages. We have
 * hpa_dirty_mult = 0, so we are allowed to keep 11 * 0 = 0 dirty
 * hugepages, but we have 5, so all of them should be purged.
 */
expect_zu_eq(5, ndefer_purge_calls, "Expect purges");
ndefer_purge_calls = 0;
destroy_test_data(shard);
}
TEST_END
int
main(void) {
/*
@@ -756,5 +878,7 @@ main(void) {
 	test_no_min_purge_interval,
 	test_min_purge_interval,
 	test_purge,
-	test_experimental_max_purge_nhp);
+	test_experimental_max_purge_nhp,
+	test_demand_purge_slack,
+	test_demand_purge_tight);
 }

test/unit/mallctl.c

@@ -295,6 +295,7 @@ TEST_BEGIN(test_mallctl_opt) {
 	TEST_MALLCTL_OPT(size_t, hpa_sec_bytes_after_flush, always);
 	TEST_MALLCTL_OPT(size_t, hpa_sec_batch_fill_extra, always);
 	TEST_MALLCTL_OPT(ssize_t, experimental_hpa_max_purge_nhp, always);
+	TEST_MALLCTL_OPT(uint64_t, hpa_peak_demand_window_ms, always);
 	TEST_MALLCTL_OPT(unsigned, narenas, always);
 	TEST_MALLCTL_OPT(const char *, percpu_arena, always);
 	TEST_MALLCTL_OPT(size_t, oversize_threshold, always);
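
The new option is readable through the usual mallctl interface; a
minimal usage sketch (assuming the standard "opt." namespace, as
TEST_MALLCTL_OPT implies):

#include <stdio.h>
#include <stdint.h>
#include <jemalloc/jemalloc.h>

int
main(void) {
	uint64_t window_ms;
	size_t sz = sizeof(window_ms);
	/* Read back the configured peak demand window duration. */
	if (mallctl("opt.hpa_peak_demand_window_ms", &window_ms, &sz,
	    NULL, 0) == 0) {
		printf("peak demand window: %llu ms\n",
		    (unsigned long long)window_ms);
	}
	return 0;
}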

test/unit/peak_demand.c (new file, 162 lines)

@@ -0,0 +1,162 @@
#include "test/jemalloc_test.h"
#include "jemalloc/internal/peak_demand.h"
TEST_BEGIN(test_peak_demand_init) {
peak_demand_t peak_demand;
/*
* Exact value doesn't matter here as we don't advance epoch in this
* test.
*/
uint64_t interval_ms = 1000;
peak_demand_init(&peak_demand, interval_ms);
expect_zu_eq(peak_demand_nactive_max(&peak_demand), 0,
    "Unexpected nactive_max value after initialization");
}
TEST_END
TEST_BEGIN(test_peak_demand_update_basic) {
peak_demand_t peak_demand;
/* Make each bucket exactly one second to simplify math. */
uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS;
peak_demand_init(&peak_demand, interval_ms);
nstime_t now;
nstime_init2(&now, /* sec */ 0, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 1024);
nstime_init2(&now, /* sec */ 1, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 512);
nstime_init2(&now, /* sec */ 2, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 256);
expect_zu_eq(peak_demand_nactive_max(&peak_demand), 1024, "");
}
TEST_END
TEST_BEGIN(test_peak_demand_update_skip_epochs) {
peak_demand_t peak_demand;
uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS;
peak_demand_init(&peak_demand, interval_ms);
nstime_t now;
nstime_init2(&now, /* sec */ 0, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 1024);
nstime_init2(&now, /* sec */ PEAK_DEMAND_NBUCKETS - 1, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 512);
nstime_init2(&now, /* sec */ 2 * (PEAK_DEMAND_NBUCKETS - 1),
/* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 256);
/*
 * Updates are not evenly spread over time. When we update at second
 * 2 * (PEAK_DEMAND_NBUCKETS - 1), the 1024 value is already out of
 * the sliding window, but 512 is still present.
 */
expect_zu_eq(peak_demand_nactive_max(&peak_demand), 512, "");
}
TEST_END
TEST_BEGIN(test_peak_demand_update_rewrite_optimization) {
peak_demand_t peak_demand;
uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS;
peak_demand_init(&peak_demand, interval_ms);
nstime_t now;
nstime_init2(&now, /* sec */ 0, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 1024);
nstime_init2(&now, /* sec */ 0, /* nsec */ UINT64_MAX);
/*
 * This update should take a reasonable amount of time if the
 * optimization is working correctly; otherwise we'd loop from 0 to
 * UINT64_MAX and this test would take a long time to finish.
 */
peak_demand_update(&peak_demand, &now, /* nactive */ 512);
expect_zu_eq(peak_demand_nactive_max(&peak_demand), 512, "");
}
TEST_END
TEST_BEGIN(test_peak_demand_update_out_of_interval) {
peak_demand_t peak_demand;
uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS;
peak_demand_init(&peak_demand, interval_ms);
nstime_t now;
nstime_init2(&now, /* sec */ 0 * PEAK_DEMAND_NBUCKETS, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 1024);
nstime_init2(&now, /* sec */ 1 * PEAK_DEMAND_NBUCKETS, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 512);
nstime_init2(&now, /* sec */ 2 * PEAK_DEMAND_NBUCKETS, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 256);
/*
 * The update frequency is lower than the tracking interval, so only
 * the last value should remain.
 */
}
TEST_END
TEST_BEGIN(test_peak_demand_update_static_epoch) {
peak_demand_t peak_demand;
uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS;
peak_demand_init(&peak_demand, interval_ms);
nstime_t now;
nstime_init_zero(&now);
/* Big enough value to overwrite values in circular buffer. */
size_t nactive_max = 2 * PEAK_DEMAND_NBUCKETS;
for (size_t nactive = 0; nactive <= nactive_max; ++nactive) {
/*
 * We should overwrite the value in the same bucket, as the now value
 * doesn't change between iterations.
 */
peak_demand_update(&peak_demand, &now, nactive);
}
expect_zu_eq(peak_demand_nactive_max(&peak_demand), nactive_max, "");
}
TEST_END
TEST_BEGIN(test_peak_demand_update_epoch_advance) {
peak_demand_t peak_demand;
uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS;
peak_demand_init(&peak_demand, interval_ms);
nstime_t now;
/* Big enough value to overwrite values in circular buffer. */
size_t nactive_max = 2 * PEAK_DEMAND_NBUCKETS;
for (size_t nactive = 0; nactive <= nactive_max; ++nactive) {
uint64_t sec = nactive;
nstime_init2(&now, sec, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, nactive);
}
expect_zu_eq(peak_demand_nactive_max(&peak_demand), nactive_max, "");
}
TEST_END
int
main(void) {
return test_no_reentrancy(
test_peak_demand_init,
test_peak_demand_update_basic,
test_peak_demand_update_skip_epochs,
test_peak_demand_update_rewrite_optimization,
test_peak_demand_update_out_of_interval,
test_peak_demand_update_static_epoch,
test_peak_demand_update_epoch_advance);
}
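
The peak_demand.c implementation itself is not shown on this page, but
the tests above pin down its interface (peak_demand_init,
peak_demand_update, peak_demand_nactive_max, PEAK_DEMAND_NBUCKETS). A
minimal sketch consistent with those tests, assuming a bucketed
circular buffer, could look like this:

/*
 * Sketch only: a bucketed sliding-window maximum.  All field names,
 * the bucket math, and the PEAK_DEMAND_NBUCKETS value below are
 * assumptions based on the tests above, not the committed
 * peak_demand.c.
 */
#include <stdint.h>
#include <string.h>
#include "jemalloc/internal/nstime.h" /* nstime_t, nstime_ns(). */

#define PEAK_DEMAND_NBUCKETS 16 /* Assumed bucket count. */

typedef struct peak_demand_s {
	uint64_t bucket_ns; /* Duration of one bucket, in nanoseconds. */
	uint64_t epoch;     /* Bucket sequence number of last update. */
	size_t nactive[PEAK_DEMAND_NBUCKETS]; /* Per-bucket maxima. */
} peak_demand_t;

void
peak_demand_init(peak_demand_t *pd, uint64_t interval_ms) {
	pd->bucket_ns = interval_ms * 1000 * 1000 / PEAK_DEMAND_NBUCKETS;
	pd->epoch = 0;
	memset(pd->nactive, 0, sizeof(pd->nactive));
}

void
peak_demand_update(peak_demand_t *pd, nstime_t *now, size_t nactive) {
	uint64_t epoch = nstime_ns(now) / pd->bucket_ns;
	if (epoch > pd->epoch) {
		/*
		 * Zero out buckets skipped since the last update.
		 * Capping the loop at PEAK_DEMAND_NBUCKETS is the
		 * optimization the rewrite test relies on: a huge time
		 * jump clears the whole buffer once instead of
		 * iterating per skipped epoch.
		 */
		uint64_t nskip = epoch - pd->epoch;
		if (nskip > PEAK_DEMAND_NBUCKETS) {
			nskip = PEAK_DEMAND_NBUCKETS;
		}
		for (uint64_t i = 1; i <= nskip; i++) {
			size_t idx = (pd->epoch + i) % PEAK_DEMAND_NBUCKETS;
			pd->nactive[idx] = 0;
		}
		pd->epoch = epoch;
	}
	/* Record the maximum demand observed within the current bucket. */
	size_t idx = epoch % PEAK_DEMAND_NBUCKETS;
	if (nactive > pd->nactive[idx]) {
		pd->nactive[idx] = nactive;
	}
}

size_t
peak_demand_nactive_max(peak_demand_t *pd) {
	/* Peak demand is the maximum across all buckets in the window. */
	size_t max = 0;
	for (size_t i = 0; i < PEAK_DEMAND_NBUCKETS; i++) {
		if (pd->nactive[i] > max) {
			max = pd->nactive[i];
		}
	}
	return max;
}

With this layout, each bucket covers interval_ms / PEAK_DEMAND_NBUCKETS
of time; the window maximum is the max over per-bucket maxima, which
keeps updates O(1) amortized and queries O(PEAK_DEMAND_NBUCKETS).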