From aaa29003ab90b574c29dc4c0c331085c07f1c1fd Mon Sep 17 00:00:00 2001
From: Dmitry Ilvokhin
Date: Tue, 6 Aug 2024 08:47:57 -0700
Subject: [PATCH] Limit maximum number of purged slabs with option

Option `experimental_hpa_max_purge_nhp` is introduced for backward
compatibility reasons: to make it possible to have behaviour similar to
the buggy `hpa_strict_min_purge_interval` implementation.

When `experimental_hpa_max_purge_nhp` is set to -1, there is no limit on
the number of slabs we'll purge on each iteration. Otherwise, we'll
purge no more than `experimental_hpa_max_purge_nhp` hugepages (slabs).
This in turn means we might not purge enough dirty pages to satisfy the
`hpa_dirty_mult` requirement.

The combination of the `hpa_dirty_mult`, `experimental_hpa_max_purge_nhp`
and `hpa_strict_min_purge_interval` options allows us to have a steady
rate of pages returned back to the system. This provides stricter
latency guarantees, as the number of `madvise` calls is bounded (and
hence the number of TLB shootdowns is limited), in exchange for weaker
memory usage guarantees.
---
 include/jemalloc/internal/hpa_opts.h |  9 +++-
 src/ctl.c                            |  8 ++-
 src/hpa.c                            | 17 ++++++-
 src/jemalloc.c                       |  4 ++
 src/stats.c                          |  1 +
 test/unit/hpa.c                      | 75 ++++++++++++++++++++++++++--
 test/unit/mallctl.c                  |  1 +
 7 files changed, 109 insertions(+), 6 deletions(-)

diff --git a/include/jemalloc/internal/hpa_opts.h b/include/jemalloc/internal/hpa_opts.h
index 93add641..bf3de0e9 100644
--- a/include/jemalloc/internal/hpa_opts.h
+++ b/include/jemalloc/internal/hpa_opts.h
@@ -57,6 +57,11 @@ struct hpa_shard_opts_s {
	 * purging logic fix.
	 */
	bool strict_min_purge_interval;
+
+	/*
+	 * Maximum number of hugepages to purge on each purging attempt.
+ */ + ssize_t experimental_max_purge_nhp; }; #define HPA_SHARD_OPTS_DEFAULT { \ @@ -79,7 +84,9 @@ struct hpa_shard_opts_s { /* min_purge_interval_ms */ \ 5 * 1000, \ /* strict_min_purge_interval */ \ - false \ + false, \ + /* experimental_max_purge_nhp */ \ + -1 \ } #endif /* JEMALLOC_INTERNAL_HPA_OPTS_H */ diff --git a/src/ctl.c b/src/ctl.c index ebe5c61c..a01f643e 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -104,6 +104,7 @@ CTL_PROTO(opt_hpa_hugification_threshold) CTL_PROTO(opt_hpa_hugify_delay_ms) CTL_PROTO(opt_hpa_min_purge_interval_ms) CTL_PROTO(opt_hpa_strict_min_purge_interval) +CTL_PROTO(opt_experimental_hpa_max_purge_nhp) CTL_PROTO(opt_hpa_dirty_mult) CTL_PROTO(opt_hpa_sec_nshards) CTL_PROTO(opt_hpa_sec_max_alloc) @@ -460,7 +461,10 @@ static const ctl_named_node_t opt_node[] = { CTL(opt_hpa_hugification_threshold)}, {NAME("hpa_hugify_delay_ms"), CTL(opt_hpa_hugify_delay_ms)}, {NAME("hpa_min_purge_interval_ms"), CTL(opt_hpa_min_purge_interval_ms)}, - {NAME("hpa_strict_min_purge_interval"), CTL(opt_hpa_strict_min_purge_interval)}, + {NAME("hpa_strict_min_purge_interval"), + CTL(opt_hpa_strict_min_purge_interval)}, + {NAME("experimental_hpa_max_purge_nhp"), + CTL(opt_experimental_hpa_max_purge_nhp)}, {NAME("hpa_dirty_mult"), CTL(opt_hpa_dirty_mult)}, {NAME("hpa_sec_nshards"), CTL(opt_hpa_sec_nshards)}, {NAME("hpa_sec_max_alloc"), CTL(opt_hpa_sec_max_alloc)}, @@ -2197,6 +2201,8 @@ CTL_RO_NL_GEN(opt_hpa_min_purge_interval_ms, opt_hpa_opts.min_purge_interval_ms, uint64_t) CTL_RO_NL_GEN(opt_hpa_strict_min_purge_interval, opt_hpa_opts.strict_min_purge_interval, bool) +CTL_RO_NL_GEN(opt_experimental_hpa_max_purge_nhp, + opt_hpa_opts.experimental_max_purge_nhp, ssize_t) /* * This will have to change before we publicly document this option; fxp_t and diff --git a/src/hpa.c b/src/hpa.c index d3b9c6c2..0410fefc 100644 --- a/src/hpa.c +++ b/src/hpa.c @@ -552,7 +552,22 @@ hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard, * too frequently. 
*/ if (hpa_min_purge_interval_passed(tsdn, shard)) { - while (hpa_should_purge(tsdn, shard) && nops < max_ops) { + size_t max_purges = max_ops; + /* + * Limit number of hugepages (slabs) to purge. + * When experimental_max_purge_nhp option is used, there is no + * guarantee we'll always respect dirty_mult option. Option + * experimental_max_purge_nhp provides a way to configure same + * behaviour as was possible before, with buggy implementation + * of purging algorithm. + */ + ssize_t max_purge_nhp = shard->opts.experimental_max_purge_nhp; + if (max_purge_nhp != -1 && + max_purges > (size_t)max_purge_nhp) { + max_purges = max_purge_nhp; + } + + while (hpa_should_purge(tsdn, shard) && nops < max_purges) { if (!hpa_try_purge(tsdn, shard)) { /* * It is fine if we couldn't purge as sometimes diff --git a/src/jemalloc.c b/src/jemalloc.c index abd7540f..4859cff6 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1558,6 +1558,10 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], opt_hpa_opts.strict_min_purge_interval, "hpa_strict_min_purge_interval"); + CONF_HANDLE_SSIZE_T( + opt_hpa_opts.experimental_max_purge_nhp, + "experimental_hpa_max_purge_nhp", -1, SSIZE_MAX); + if (CONF_MATCH("hpa_dirty_mult")) { if (CONF_MATCH_VALUE("-1")) { opt_hpa_opts.dirty_mult = (fxp_t)-1; diff --git a/src/stats.c b/src/stats.c index fbfacabf..a5c3f0fe 100644 --- a/src/stats.c +++ b/src/stats.c @@ -1565,6 +1565,7 @@ stats_general_print(emitter_t *emitter) { OPT_WRITE_UINT64("hpa_hugify_delay_ms") OPT_WRITE_UINT64("hpa_min_purge_interval_ms") OPT_WRITE_BOOL("hpa_strict_min_purge_interval") + OPT_WRITE_SSIZE_T("experimental_hpa_max_purge_nhp") if (je_mallctl("opt.hpa_dirty_mult", (void *)&u32v, &u32sz, NULL, 0) == 0) { /* diff --git a/test/unit/hpa.c b/test/unit/hpa.c index 2c11e0a8..4f15876b 100644 --- a/test/unit/hpa.c +++ b/test/unit/hpa.c @@ -35,7 +35,9 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = { /* min_purge_interval_ms */ 5 * 1000, /* 
strict_min_purge_interval */ - false + false, + /* experimental_max_purge_nhp */ + -1 }; static hpa_shard_opts_t test_hpa_shard_opts_purge = { @@ -52,7 +54,9 @@ static hpa_shard_opts_t test_hpa_shard_opts_purge = { /* min_purge_interval_ms */ 5 * 1000, /* strict_min_purge_interval */ - false + false, + /* experimental_max_purge_nhp */ + -1 }; static hpa_shard_t * @@ -653,6 +657,70 @@ TEST_BEGIN(test_purge) { } TEST_END +TEST_BEGIN(test_experimental_max_purge_nhp) { + test_skip_if(!hpa_supported()); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + + hpa_shard_opts_t opts = test_hpa_shard_opts_default; + opts.deferral_allowed = true; + opts.experimental_max_purge_nhp = 1; + + hpa_shard_t *shard = create_test_data(&hooks, &opts); + + bool deferred_work_generated = false; + + nstime_init(&defer_curtime, 0); + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + enum {NALLOCS = 8 * HUGEPAGE_PAGES}; + edata_t *edatas[NALLOCS]; + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + /* Deallocate 3 hugepages out of 8. */ + for (int i = 0; i < 3 * (int)HUGEPAGE_PAGES; i++) { + pai_dalloc(tsdn, &shard->pai, edatas[i], + &deferred_work_generated); + } + hpa_shard_do_deferred_work(tsdn, shard); + + expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early"); + expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early"); + /* + * Expect only one purge call, because opts.experimental_max_purge_nhp + * is set to 1. 
+	 */
+	expect_zu_eq(1, ndefer_purge_calls, "Expect purges");
+	ndefer_purge_calls = 0;
+
+	hpa_shard_do_deferred_work(tsdn, shard);
+
+	expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early");
+	expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
+	/* We are still above the limit for dirty pages. */
+	expect_zu_eq(1, ndefer_purge_calls, "Expect purge");
+	ndefer_purge_calls = 0;
+
+	hpa_shard_do_deferred_work(tsdn, shard);
+
+	expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early");
+	expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
+	/* Finally, we are below the limit, no purges are expected. */
+	expect_zu_eq(0, ndefer_purge_calls, "Purged too early");
+
+	destroy_test_data(shard);
+}
+TEST_END
+
 int
 main(void) {
 	/*
@@ -675,5 +743,6 @@ main(void) {
 	    test_purge_no_infinite_loop,
 	    test_strict_no_min_purge_interval,
 	    test_strict_min_purge_interval,
-	    test_purge);
+	    test_purge,
+	    test_experimental_max_purge_nhp);
 }
diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c
index 84cd3995..ffe5c411 100644
--- a/test/unit/mallctl.c
+++ b/test/unit/mallctl.c
@@ -292,6 +292,7 @@ TEST_BEGIN(test_mallctl_opt) {
 	TEST_MALLCTL_OPT(size_t, hpa_sec_max_bytes, always);
 	TEST_MALLCTL_OPT(size_t, hpa_sec_bytes_after_flush, always);
 	TEST_MALLCTL_OPT(size_t, hpa_sec_batch_fill_extra, always);
+	TEST_MALLCTL_OPT(ssize_t, experimental_hpa_max_purge_nhp, always);
 	TEST_MALLCTL_OPT(unsigned, narenas, always);
 	TEST_MALLCTL_OPT(const char *, percpu_arena, always);
 	TEST_MALLCTL_OPT(size_t, oversize_threshold, always);