Extend purging algorithm with peak demand tracking

The implementation is inspired by the idea described in the paper
"Beyond malloc efficiency to fleet efficiency: a hugepage-aware memory
allocator" [1].

The primary idea is to track the maximum number (peak) of active pages
in use within a sliding window, and then use this number to decide how
many dirty pages we would like to keep.

We are trying to estimate the maximum amount of active memory we will
need in the near future. We do so by projecting future active memory
demand (based on the peak active memory usage observed in the past
within the sliding window) and adding slack on top of it (some overhead
is reasonable to accept in exchange for higher hugepage coverage). When
peak demand tracking is off, the projection of future active memory is
simply the active memory we have right now.

The estimate is essentially `nactive_max * (1 + dirty_mult)`.
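
To make the arithmetic concrete (an illustrative sketch only; the
helper name below is made up and not part of this commit): with a peak
demand of 11 hugepages and dirty_mult = 0.1, the retained footprint
target is 11 * (1 + 0.1) = 12.1 hugepages, i.e. up to 1.1 hugepages of
dirty memory may stay resident on top of peak demand.

/*
 * Sketch: retained footprint target derived from peak demand.  The
 * name `peak_demand_retain_target` is illustrative, not from the
 * commit; the real code uses fixed-point (fxp) arithmetic rather
 * than doubles.
 */
static size_t
peak_demand_retain_target(size_t nactive_max, double dirty_mult) {
	/* Keep peak demand plus a dirty_mult fraction of slack. */
	return (size_t)(nactive_max * (1 + dirty_mult));
}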

The peak demand purging algorithm is controlled by two config options.
`hpa_peak_demand_window_ms` controls the duration of the sliding window
over which we track maximum active memory usage, and `hpa_dirty_mult`
controls the amount of slack we are allowed to keep, as a percentage of
the maximum active memory usage. By default `hpa_peak_demand_window_ms
== 0`, which keeps the same behaviour (ratio-based purging) that we had
before this commit.
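
For example, a 10 second window with 10% slack could be requested via
the malloc_conf string (a sketch; the option spellings are assumed to
match the mallctl option names exercised in the tests below):

/* Assumes the HPA is enabled and these option names are accepted. */
const char *malloc_conf =
    "hpa:true,hpa_peak_demand_window_ms:10000,hpa_dirty_mult:0.1";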

[1]: https://storage.googleapis.com/gweb-research2023-media/pubtools/6170.pdf
Author: Dmitry Ilvokhin
Date: 2025-01-21 07:20:15 -08:00
Commit: d36aa77e8a (parent: 499f306859)
20 changed files with 537 additions and 29 deletions

test/unit/hpa.c

@@ -37,26 +37,9 @@ static hpa_shard_opts_t test_hpa_shard_opts_default = {
 	/* min_purge_interval_ms */
 	5 * 1000,
 	/* experimental_max_purge_nhp */
-	-1
-};
-static hpa_shard_opts_t test_hpa_shard_opts_purge = {
-	/* slab_max_alloc */
-	HUGEPAGE,
-	/* hugification_threshold */
-	0.9 * HUGEPAGE,
-	/* dirty_mult */
-	FXP_INIT_PERCENT(11),
-	/* deferral_allowed */
-	true,
-	/* hugify_delay_ms */
-	0,
-	/* hugify_sync */
-	false,
-	/* min_purge_interval_ms */
-	5 * 1000,
-	/* experimental_max_purge_nhp */
-	-1
+	-1,
+	/* peak_demand_window_ms */
+	0
 };
 static hpa_shard_t *
@@ -480,8 +463,14 @@ TEST_END
 TEST_BEGIN(test_purge_no_infinite_loop) {
 	test_skip_if(!hpa_supported());
-	hpa_shard_t *shard = create_test_data(&hpa_hooks_default,
-	    &test_hpa_shard_opts_purge);
+	hpa_shard_opts_t opts = test_hpa_shard_opts_default;
+	opts.slab_max_alloc = HUGEPAGE;
+	opts.hugification_threshold = 0.9 * HUGEPAGE;
+	opts.dirty_mult = FXP_INIT_PERCENT(11);
+	opts.deferral_allowed = true;
+	opts.hugify_delay_ms = 0;
+	hpa_shard_t *shard = create_test_data(&hpa_hooks_default, &opts);
 	tsdn_t *tsdn = tsd_tsdn(tsd_fetch());
 	/*
@@ -489,8 +478,7 @@ TEST_BEGIN(test_purge_no_infinite_loop) {
 	 * criteria for huge page and at the same time do not allow hugify page
 	 * without triggering a purge.
 	 */
-	const size_t npages =
-	    test_hpa_shard_opts_purge.hugification_threshold / PAGE + 1;
+	const size_t npages = opts.hugification_threshold / PAGE + 1;
 	const size_t size = npages * PAGE;
 	bool deferred_work_generated = false;
@@ -733,6 +721,140 @@ TEST_BEGIN(test_experimental_max_purge_nhp) {
}
TEST_END
TEST_BEGIN(test_demand_purge_slack) {
test_skip_if(!hpa_supported());
hpa_hooks_t hooks;
hooks.map = &defer_test_map;
hooks.unmap = &defer_test_unmap;
hooks.purge = &defer_test_purge;
hooks.hugify = &defer_test_hugify;
hooks.dehugify = &defer_test_dehugify;
hooks.curtime = &defer_test_curtime;
hooks.ms_since = &defer_test_ms_since;
hpa_shard_opts_t opts = test_hpa_shard_opts_default;
opts.deferral_allowed = true;
/* Allow 10% of slack. */
opts.dirty_mult = FXP_INIT_PERCENT(10);
/* Peak demand sliding window duration is 10 seconds. */
opts.peak_demand_window_ms = 10 * 1000;
hpa_shard_t *shard = create_test_data(&hooks, &opts);
bool deferred_work_generated = false;
nstime_init(&defer_curtime, 0);
tsdn_t *tsdn = tsd_tsdn(tsd_fetch());
enum {NALLOCS = 16 * HUGEPAGE_PAGES};
edata_t *edatas[NALLOCS];
for (int i = 0; i < NALLOCS; i++) {
edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false,
false, false, &deferred_work_generated);
expect_ptr_not_null(edatas[i], "Unexpected null edata");
}
/* Deallocate 5 hugepages out of 16. */
for (int i = 0; i < 5 * (int)HUGEPAGE_PAGES; i++) {
pai_dalloc(tsdn, &shard->pai, edatas[i],
&deferred_work_generated);
}
nstime_init2(&defer_curtime, 6, 0);
hpa_shard_do_deferred_work(tsdn, shard);
expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early");
expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
/*
* Peak demand within sliding window is 16 hugepages, so we don't need
* to purge anything just yet.
*/
expect_zu_eq(0, ndefer_purge_calls, "Purged too early");
nstime_init2(&defer_curtime, 12, 0);
hpa_shard_do_deferred_work(tsdn, shard);
expect_zu_eq(11, ndefer_hugify_calls, "Expect hugification");
ndefer_hugify_calls = 0;
expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
/*
 * 12 seconds passed now, peak demand is 11 hugepages, so we are
 * allowed to keep 11 * 0.1 (hpa_dirty_mult) = 1.1 dirty hugepages,
 * but we have 5 dirty hugepages, so we should purge 4 of them.
 */
expect_zu_eq(4, ndefer_purge_calls, "Expect purges");
ndefer_purge_calls = 0;
destroy_test_data(shard);
}
TEST_END
TEST_BEGIN(test_demand_purge_tight) {
test_skip_if(!hpa_supported());
hpa_hooks_t hooks;
hooks.map = &defer_test_map;
hooks.unmap = &defer_test_unmap;
hooks.purge = &defer_test_purge;
hooks.hugify = &defer_test_hugify;
hooks.dehugify = &defer_test_dehugify;
hooks.curtime = &defer_test_curtime;
hooks.ms_since = &defer_test_ms_since;
hpa_shard_opts_t opts = test_hpa_shard_opts_default;
opts.deferral_allowed = true;
/* No slack allowed. */
opts.dirty_mult = FXP_INIT_PERCENT(0);
/* Peak demand sliding window duration is 10 seconds. */
opts.peak_demand_window_ms = 10 * 1000;
hpa_shard_t *shard = create_test_data(&hooks, &opts);
bool deferred_work_generated = false;
nstime_init(&defer_curtime, 0);
tsdn_t *tsdn = tsd_tsdn(tsd_fetch());
enum {NALLOCS = 16 * HUGEPAGE_PAGES};
edata_t *edatas[NALLOCS];
for (int i = 0; i < NALLOCS; i++) {
edatas[i] = pai_alloc(tsdn, &shard->pai, PAGE, PAGE, false,
false, false, &deferred_work_generated);
expect_ptr_not_null(edatas[i], "Unexpected null edata");
}
/* Deallocate 5 hugepages out of 16. */
for (int i = 0; i < 5 * (int)HUGEPAGE_PAGES; i++) {
pai_dalloc(tsdn, &shard->pai, edatas[i],
&deferred_work_generated);
}
nstime_init2(&defer_curtime, 6, 0);
hpa_shard_do_deferred_work(tsdn, shard);
expect_zu_eq(0, ndefer_hugify_calls, "Hugified too early");
expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
/*
 * Peak demand within sliding window is 16 hugepages, so we don't need
 * to purge anything just yet.
 */
expect_zu_eq(0, ndefer_purge_calls, "Purged too early");
nstime_init2(&defer_curtime, 12, 0);
hpa_shard_do_deferred_work(tsdn, shard);
expect_zu_eq(11, ndefer_hugify_calls, "Expect hugification");
ndefer_hugify_calls = 0;
expect_zu_eq(0, ndefer_dehugify_calls, "Dehugified too early");
/*
 * 12 seconds passed now, peak demand is 11 hugepages. We have
 * hpa_dirty_mult = 0, so we are allowed to keep 11 * 0 = 0 dirty
 * hugepages, but we have 5, so all of them should be purged.
 */
expect_zu_eq(5, ndefer_purge_calls, "Expect purges");
ndefer_purge_calls = 0;
destroy_test_data(shard);
}
TEST_END
int
main(void) {
/*
@@ -756,5 +878,7 @@ main(void) {
 	test_no_min_purge_interval,
 	test_min_purge_interval,
 	test_purge,
-	test_experimental_max_purge_nhp);
+	test_experimental_max_purge_nhp,
+	test_demand_purge_slack,
+	test_demand_purge_tight);
 }

test/unit/mallctl.c

@@ -295,6 +295,7 @@ TEST_BEGIN(test_mallctl_opt) {
 	TEST_MALLCTL_OPT(size_t, hpa_sec_bytes_after_flush, always);
 	TEST_MALLCTL_OPT(size_t, hpa_sec_batch_fill_extra, always);
 	TEST_MALLCTL_OPT(ssize_t, experimental_hpa_max_purge_nhp, always);
+	TEST_MALLCTL_OPT(uint64_t, hpa_peak_demand_window_ms, always);
 	TEST_MALLCTL_OPT(unsigned, narenas, always);
 	TEST_MALLCTL_OPT(const char *, percpu_arena, always);
 	TEST_MALLCTL_OPT(size_t, oversize_threshold, always);
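
The new option is readable through the usual mallctl interface; a
minimal usage sketch (assuming the standard "opt." namespace, as
TEST_MALLCTL_OPT implies):

#include <stdio.h>
#include <stdint.h>
#include <jemalloc/jemalloc.h>

int
main(void) {
	uint64_t window_ms;
	size_t sz = sizeof(window_ms);
	/* Read back the configured peak demand window duration. */
	if (mallctl("opt.hpa_peak_demand_window_ms", &window_ms, &sz,
	    NULL, 0) == 0) {
		printf("peak demand window: %llu ms\n",
		    (unsigned long long)window_ms);
	}
	return 0;
}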

test/unit/peak_demand.c (new file, 162 lines)

@@ -0,0 +1,162 @@
#include "test/jemalloc_test.h"
#include "jemalloc/internal/peak_demand.h"
TEST_BEGIN(test_peak_demand_init) {
peak_demand_t peak_demand;
/*
* Exact value doesn't matter here as we don't advance epoch in this
* test.
*/
uint64_t interval_ms = 1000;
peak_demand_init(&peak_demand, interval_ms);
expect_zu_eq(peak_demand_nactive_max(&peak_demand), 0,
    "Unexpected nactive_max value after initialization");
}
TEST_END
TEST_BEGIN(test_peak_demand_update_basic) {
peak_demand_t peak_demand;
/* Make each bucket exactly one second to simplify math. */
uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS;
peak_demand_init(&peak_demand, interval_ms);
nstime_t now;
nstime_init2(&now, /* sec */ 0, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 1024);
nstime_init2(&now, /* sec */ 1, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 512);
nstime_init2(&now, /* sec */ 2, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 256);
expect_zu_eq(peak_demand_nactive_max(&peak_demand), 1024, "");
}
TEST_END
TEST_BEGIN(test_peak_demand_update_skip_epochs) {
peak_demand_t peak_demand;
uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS;
peak_demand_init(&peak_demand, interval_ms);
nstime_t now;
nstime_init2(&now, /* sec */ 0, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 1024);
nstime_init2(&now, /* sec */ PEAK_DEMAND_NBUCKETS - 1, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 512);
nstime_init2(&now, /* sec */ 2 * (PEAK_DEMAND_NBUCKETS - 1),
/* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 256);
/*
 * Updates are not evenly spread over time. When we update at second
 * 2 * (PEAK_DEMAND_NBUCKETS - 1), the 1024 value is already out of
 * the sliding window, but 512 is still present.
 */
expect_zu_eq(peak_demand_nactive_max(&peak_demand), 512, "");
}
TEST_END
TEST_BEGIN(test_peak_demand_update_rewrite_optimization) {
peak_demand_t peak_demand;
uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS;
peak_demand_init(&peak_demand, interval_ms);
nstime_t now;
nstime_init2(&now, /* sec */ 0, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 1024);
nstime_init2(&now, /* sec */ 0, /* nsec */ UINT64_MAX);
/*
 * This update should take a reasonable amount of time if the
 * optimization is working correctly; otherwise we'd loop from 0 to
 * UINT64_MAX and this test would take a long time to finish.
 */
peak_demand_update(&peak_demand, &now, /* nactive */ 512);
expect_zu_eq(peak_demand_nactive_max(&peak_demand), 512, "");
}
TEST_END
TEST_BEGIN(test_peak_demand_update_out_of_interval) {
peak_demand_t peak_demand;
uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS;
peak_demand_init(&peak_demand, interval_ms);
nstime_t now;
nstime_init2(&now, /* sec */ 0 * PEAK_DEMAND_NBUCKETS, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 1024);
nstime_init2(&now, /* sec */ 1 * PEAK_DEMAND_NBUCKETS, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 512);
nstime_init2(&now, /* sec */ 2 * PEAK_DEMAND_NBUCKETS, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, /* nactive */ 256);
/*
 * The update frequency is lower than the tracking interval, so only
 * the last value should remain.
 */
}
TEST_END
TEST_BEGIN(test_peak_demand_update_static_epoch) {
peak_demand_t peak_demand;
uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS;
peak_demand_init(&peak_demand, interval_ms);
nstime_t now;
nstime_init_zero(&now);
/* Big enough value to overwrite values in circular buffer. */
size_t nactive_max = 2 * PEAK_DEMAND_NBUCKETS;
for (size_t nactive = 0; nactive <= nactive_max; ++nactive) {
/*
 * We should overwrite the value in the same bucket, as the now value
 * doesn't change between iterations.
 */
peak_demand_update(&peak_demand, &now, nactive);
}
expect_zu_eq(peak_demand_nactive_max(&peak_demand), nactive_max, "");
}
TEST_END
TEST_BEGIN(test_peak_demand_update_epoch_advance) {
peak_demand_t peak_demand;
uint64_t interval_ms = 1000 * PEAK_DEMAND_NBUCKETS;
peak_demand_init(&peak_demand, interval_ms);
nstime_t now;
/* Big enough value to overwrite values in circular buffer. */
size_t nactive_max = 2 * PEAK_DEMAND_NBUCKETS;
for (size_t nactive = 0; nactive <= nactive_max; ++nactive) {
uint64_t sec = nactive;
nstime_init2(&now, sec, /* nsec */ 0);
peak_demand_update(&peak_demand, &now, nactive);
}
expect_zu_eq(peak_demand_nactive_max(&peak_demand), nactive_max, "");
}
TEST_END
int
main(void) {
return test_no_reentrancy(
test_peak_demand_init,
test_peak_demand_update_basic,
test_peak_demand_update_skip_epochs,
test_peak_demand_update_rewrite_optimization,
test_peak_demand_update_out_of_interval,
test_peak_demand_update_static_epoch,
test_peak_demand_update_epoch_advance);
}
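
The peak_demand.c implementation itself is not shown on this page, but
the tests above pin down its interface (peak_demand_init,
peak_demand_update, peak_demand_nactive_max, PEAK_DEMAND_NBUCKETS). A
minimal sketch consistent with those tests, assuming a bucketed
circular buffer, could look like this:

/*
 * Sketch only: a bucketed sliding-window maximum.  All field names,
 * the bucket math, and the PEAK_DEMAND_NBUCKETS value below are
 * assumptions based on the tests above, not the committed
 * peak_demand.c.
 */
#include <stdint.h>
#include <string.h>
#include "jemalloc/internal/nstime.h" /* nstime_t, nstime_ns(). */

#define PEAK_DEMAND_NBUCKETS 16 /* Assumed bucket count. */

typedef struct peak_demand_s {
	uint64_t bucket_ns; /* Duration of one bucket, in nanoseconds. */
	uint64_t epoch;     /* Bucket sequence number of last update. */
	size_t nactive[PEAK_DEMAND_NBUCKETS]; /* Per-bucket maxima. */
} peak_demand_t;

void
peak_demand_init(peak_demand_t *pd, uint64_t interval_ms) {
	pd->bucket_ns = interval_ms * 1000 * 1000 / PEAK_DEMAND_NBUCKETS;
	pd->epoch = 0;
	memset(pd->nactive, 0, sizeof(pd->nactive));
}

void
peak_demand_update(peak_demand_t *pd, nstime_t *now, size_t nactive) {
	uint64_t epoch = nstime_ns(now) / pd->bucket_ns;
	if (epoch > pd->epoch) {
		/*
		 * Zero out buckets skipped since the last update.
		 * Capping the loop at PEAK_DEMAND_NBUCKETS is the
		 * optimization the rewrite test relies on: a huge time
		 * jump clears the whole buffer once instead of
		 * iterating per skipped epoch.
		 */
		uint64_t nskip = epoch - pd->epoch;
		if (nskip > PEAK_DEMAND_NBUCKETS) {
			nskip = PEAK_DEMAND_NBUCKETS;
		}
		for (uint64_t i = 1; i <= nskip; i++) {
			size_t idx = (pd->epoch + i) % PEAK_DEMAND_NBUCKETS;
			pd->nactive[idx] = 0;
		}
		pd->epoch = epoch;
	}
	/* Record the maximum demand observed within the current bucket. */
	size_t idx = epoch % PEAK_DEMAND_NBUCKETS;
	if (nactive > pd->nactive[idx]) {
		pd->nactive[idx] = nactive;
	}
}

size_t
peak_demand_nactive_max(peak_demand_t *pd) {
	/* Peak demand is the maximum across all buckets in the window. */
	size_t max = 0;
	for (size_t i = 0; i < PEAK_DEMAND_NBUCKETS; i++) {
		if (pd->nactive[i] > max) {
			max = pd->nactive[i];
		}
	}
	return max;
}

With this layout, each bucket covers interval_ms / PEAK_DEMAND_NBUCKETS
of time; the window maximum is the max over per-bucket maxima, which
keeps updates O(1) amortized and queries O(PEAK_DEMAND_NBUCKETS).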