From aaa29003ab90b574c29dc4c0c331085c07f1c1fd Mon Sep 17 00:00:00 2001
From: Dmitry Ilvokhin
Date: Tue, 6 Aug 2024 08:47:57 -0700
Subject: [PATCH] Limit maximum number of purged slabs with option

Option `experimental_hpa_max_purge_nhp` is introduced for backward
compatibility reasons: to make it possible to have behaviour similar to
the buggy `hpa_strict_min_purge_interval` implementation.

When `experimental_hpa_max_purge_nhp` is set to -1, there is no limit on
the number of slabs we'll purge on each iteration. Otherwise, we'll
purge no more than `experimental_hpa_max_purge_nhp` hugepages (slabs).
This in turn means we might not purge enough dirty pages to satisfy the
`hpa_dirty_mult` requirement.

The combination of the `hpa_dirty_mult`, `experimental_hpa_max_purge_nhp`
and `hpa_strict_min_purge_interval` options allows us to have a steady
rate of pages returned back to the system. This provides stricter
latency guarantees, as the number of `madvise` calls is bounded (and
hence the number of TLB shootdowns is limited), in exchange for weaker
memory usage guarantees.
---
 include/jemalloc/internal/hpa_opts.h |  9 +++-
 src/ctl.c                            |  8 ++-
 src/hpa.c                            | 17 ++++++-
 src/jemalloc.c                       |  4 ++
 src/stats.c                          |  1 +
 test/unit/hpa.c                      | 75 ++++++++++++++++++++++++++--
 test/unit/mallctl.c                  |  1 +
 7 files changed, 109 insertions(+), 6 deletions(-)

diff --git a/include/jemalloc/internal/hpa_opts.h b/include/jemalloc/internal/hpa_opts.h
index 93add641..bf3de0e9 100644
--- a/include/jemalloc/internal/hpa_opts.h
+++ b/include/jemalloc/internal/hpa_opts.h
@@ -57,6 +57,11 @@ struct hpa_shard_opts_s {
	 * purging logic fix.
	 */
	bool strict_min_purge_interval;
+
+	/*
+	 * Maximum number of hugepages to purge on each purging attempt.
+ */ + ssize_t experimental_max_purge_nhp; }; #define HPA_SHARD_OPTS_DEFAULT { \ @@ -79,7 +84,9 @@ struct hpa_shard_opts_s { /* min_purge_interval_ms */ \ 5 * 1000, \ /* strict_min_purge_interval */ \ - false \ + false, \ + /* experimental_max_purge_nhp */ \ + -1 \ } #endif /* JEMALLOC_INTERNAL_HPA_OPTS_H */ diff --git a/src/ctl.c b/src/ctl.c index ebe5c61c..a01f643e 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -104,6 +104,7 @@ CTL_PROTO(opt_hpa_hugification_threshold) CTL_PROTO(opt_hpa_hugify_delay_ms) CTL_PROTO(opt_hpa_min_purge_interval_ms) CTL_PROTO(opt_hpa_strict_min_purge_interval) +CTL_PROTO(opt_experimental_hpa_max_purge_nhp) CTL_PROTO(opt_hpa_dirty_mult) CTL_PROTO(opt_hpa_sec_nshards) CTL_PROTO(opt_hpa_sec_max_alloc) @@ -460,7 +461,10 @@ static const ctl_named_node_t opt_node[] = { CTL(opt_hpa_hugification_threshold)}, {NAME("hpa_hugify_delay_ms"), CTL(opt_hpa_hugify_delay_ms)}, {NAME("hpa_min_purge_interval_ms"), CTL(opt_hpa_min_purge_interval_ms)}, - {NAME("hpa_strict_min_purge_interval"), CTL(opt_hpa_strict_min_purge_interval)}, + {NAME("hpa_strict_min_purge_interval"), + CTL(opt_hpa_strict_min_purge_interval)}, + {NAME("experimental_hpa_max_purge_nhp"), + CTL(opt_experimental_hpa_max_purge_nhp)}, {NAME("hpa_dirty_mult"), CTL(opt_hpa_dirty_mult)}, {NAME("hpa_sec_nshards"), CTL(opt_hpa_sec_nshards)}, {NAME("hpa_sec_max_alloc"), CTL(opt_hpa_sec_max_alloc)}, @@ -2197,6 +2201,8 @@ CTL_RO_NL_GEN(opt_hpa_min_purge_interval_ms, opt_hpa_opts.min_purge_interval_ms, uint64_t) CTL_RO_NL_GEN(opt_hpa_strict_min_purge_interval, opt_hpa_opts.strict_min_purge_interval, bool) +CTL_RO_NL_GEN(opt_experimental_hpa_max_purge_nhp, + opt_hpa_opts.experimental_max_purge_nhp, ssize_t) /* * This will have to change before we publicly document this option; fxp_t and diff --git a/src/hpa.c b/src/hpa.c index d3b9c6c2..0410fefc 100644 --- a/src/hpa.c +++ b/src/hpa.c @@ -552,7 +552,22 @@ hpa_shard_maybe_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard, * too frequently. 
*/ if (hpa_min_purge_interval_passed(tsdn, shard)) { - while (hpa_should_purge(tsdn, shard) && nops < max_ops) { + size_t max_purges = max_ops; + /* + * Limit number of hugepages (slabs) to purge. + * When experimental_max_purge_nhp option is used, there is no + * guarantee we'll always respect dirty_mult option. Option + * experimental_max_purge_nhp provides a way to configure same + * behaviour as was possible before, with buggy implementation + * of purging algorithm. + */ + ssize_t max_purge_nhp = shard->opts.experimental_max_purge_nhp; + if (max_purge_nhp != -1 && + max_purges > (size_t)max_purge_nhp) { + max_purges = max_purge_nhp; + } + + while (hpa_should_purge(tsdn, shard) && nops < max_purges) { if (!hpa_try_purge(tsdn, shard)) { /* * It is fine if we couldn't purge as sometimes diff --git a/src/jemalloc.c b/src/jemalloc.c index abd7540f..4859cff6 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -1558,6 +1558,10 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS], opt_hpa_opts.strict_min_purge_interval, "hpa_strict_min_purge_interval"); + CONF_HANDLE_SSIZE_T( + opt_hpa_opts.experimental_max_purge_nhp, + "experimental_hpa_max_purge_nhp", -1, SSIZE_MAX); + if (CONF_MATCH("hpa_dirty_mult")) { if (CONF_MATCH_VALUE("-1")) { opt_hpa_opts.dirty_mult = (fxp_t)-1; diff --git a/src/stats.c b/src/stats.c index fbfacabf..a5c3f0fe 100644 --- a/src/stats.c +++ b/src/stats.c @@ -1565,6 +1565,7 @@ stats_general_print(emitter_t *emitter) { OPT_WRITE_UINT64("hpa_hugify_delay_ms") OPT_WRITE_UINT64("hpa_min_purge_interval_ms") OPT_WRITE_BOOL("hpa_strict_min_purge_interval") + OPT_WRITE_SSIZE_T("experimental_hpa_max_purge_nhp") if (je_mallctl("opt.hpa_dirty_mult", (void *)&u32v, &u32sz, NULL, 0) == 0) { /* diff --git a/test/unit/hpa.c b/test/unit/hpa.c index 2c11e0a8..4f15876b 100644 --- a/test/unit/hpa.c +++ b/test/unit/hpa.c @@ -35,7 +35,9 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = { /* min_purge_interval_ms */ 5 * 1000, /* 
strict_min_purge_interval */ - false + false, + /* experimental_max_purge_nhp */ + -1 }; static hpa_shard_opts_t test_hpa_shard_opts_purge = { @@ -52,7 +54,9 @@ static hpa_shard_opts_t test_hpa_shard_opts_purge = { /* min_purge_interval_ms */ 5 * 1000, /* strict_min_purge_interval */ - false + false, + /* experimental_max_purge_nhp */ + -1 }; static hpa_shard_t * @@ -653,6 +657,70 @@ TEST_BEGIN(test_purge) { } TEST_END +TEST_BEGIN(test_experimental_max_purge_nhp) { + test_skip_if(!hpa_supported()); + + hpa_hooks_t hooks; + hooks.map = &defer_test_map; + hooks.unmap = &defer_test_unmap; + hooks.purge = &defer_test_purge; + hooks.hugify = &defer_test_hugify; + hooks.dehugify = &defer_test_dehugify; + hooks.curtime = &defer_test_curtime; + hooks.ms_since = &defer_test_ms_since; + + hpa_shard_opts_t opts = test_hpa_shard_opts_default; + opts.deferral_allowed = true; + opts.experimental_max_purge_nhp = 1; + + hpa_shard_t *shard = create_test_data(&hooks, &opts); + + bool deferred_work_generated = false; + + nstime_init(&defer_curtime, 0); + tsdn_t *tsdn = tsd_tsdn(tsd_fetch()); + enum {NALLOCS = 8 * HUGEPAGE_PAGES}; + edata_t *edatas[NALLOCS]; + for (int i = 0; i < NALLOCS; i++) { + edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false, + false, false, &deferred_work_generated); + expect_ptr_not_null(edatas[i], "Unexpected null edata"); + } + /* Deallocate 3 hugepages out of 8. */ + for (int i = 0; i < 3 * (int)HUGEPAGE_PAGES; i++) { + pai_dalloc(tsdn, &shard->pai, edatas[i], + &deferred_work_generated); + } + hpa_shard_do_deferred_work(tsdn, shard); + + expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early"); + expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early"); + /* + * Expect only one purge call, because opts.experimental_max_purge_nhp + * is set to 1. 
+	 */
+	expect_zu_eq(1, ndefer_purge_calls, "Expect purges");
+	ndefer_purge_calls = 0;
+
+	hpa_shard_do_deferred_work(tsdn, shard);
+
+	expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early");
+	expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
+	/* We are still above the limit for dirty pages. */
+	expect_zu_eq(1, ndefer_purge_calls, "Expect purge");
+	ndefer_purge_calls = 0;
+
+	hpa_shard_do_deferred_work(tsdn, shard);
+
+	expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early");
+	expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
+	/* Finally, we are below the limit, no purges are expected. */
+	expect_zu_eq(0, ndefer_purge_calls, "Purged too early");
+
+	destroy_test_data(shard);
+}
+TEST_END
+
 int
 main(void) {
 	/*
@@ -675,5 +743,6 @@ main(void) {
 	    test_purge_no_infinite_loop,
 	    test_strict_no_min_purge_interval,
 	    test_strict_min_purge_interval,
-	    test_purge);
+	    test_purge,
+	    test_experimental_max_purge_nhp);
 }
diff --git a/test/unit/mallctl.c b/test/unit/mallctl.c
index 84cd3995..ffe5c411 100644
--- a/test/unit/mallctl.c
+++ b/test/unit/mallctl.c
@@ -292,6 +292,7 @@ TEST_BEGIN(test_mallctl_opt) {
 	TEST_MALLCTL_OPT(size_t, hpa_sec_max_bytes, always);
 	TEST_MALLCTL_OPT(size_t, hpa_sec_bytes_after_flush, always);
 	TEST_MALLCTL_OPT(size_t, hpa_sec_batch_fill_extra, always);
+	TEST_MALLCTL_OPT(ssize_t, experimental_hpa_max_purge_nhp, always);
 	TEST_MALLCTL_OPT(unsigned, narenas, always);
 	TEST_MALLCTL_OPT(const char *, percpu_arena, always);
 	TEST_MALLCTL_OPT(size_t, oversize_threshold, always);