From cfa90dfd80c4b3ca2b2678fb55cfc718bd9f42c6 Mon Sep 17 00:00:00 2001 From: Slobodan Predolac Date: Tue, 8 Apr 2025 09:51:53 -0700 Subject: [PATCH] Refactor hpa purging to prepare for vectorized call across multiple pages --- include/jemalloc/internal/hpa_utils.h | 82 +++++++++++++++++++++++++++ src/hpa.c | 63 +++++--------------- 2 files changed, 97 insertions(+), 48 deletions(-) create mode 100644 include/jemalloc/internal/hpa_utils.h diff --git a/include/jemalloc/internal/hpa_utils.h b/include/jemalloc/internal/hpa_utils.h new file mode 100644 index 00000000..035d3b21 --- /dev/null +++ b/include/jemalloc/internal/hpa_utils.h @@ -0,0 +1,82 @@ +#ifndef JEMALLOC_INTERNAL_HPA_UTILS_H +#define JEMALLOC_INTERNAL_HPA_UTILS_H + +#include "jemalloc/internal/hpa.h" + +#define HPA_MIN_VAR_VEC_SIZE 8 +#ifdef JEMALLOC_HAVE_PROCESS_MADVISE +typedef struct iovec hpa_io_vector_t; +#else +typedef struct { + void *iov_base; + size_t iov_len; +} hpa_io_vector_t; +#endif + +/* Actually invoke hooks. If we fail vectorized, use single purges */ +static void +hpa_try_vectorized_purge( + hpa_shard_t *shard, hpa_io_vector_t *vec, size_t vlen, size_t nbytes) { + bool success = opt_process_madvise_max_batch > 0 + && !shard->central->hooks.vectorized_purge(vec, vlen, nbytes); + if (!success) { + /* On failure, it is safe to purge again (potential perf + * penalty) If kernel can tell exactly which regions + * failed, we could avoid that penalty. + */ + for (size_t i = 0; i < vlen; ++i) { + shard->central->hooks.purge(vec[i].iov_base, vec[i].iov_len); + } + } +} + +/* + * This struct accumulates the regions for process_madvise. 
It invokes the hook when the batch limit is reached.
shard); } -/* If we fail vectorized purge, we will do single */ -static void -hpa_try_vectorized_purge(hpa_shard_t *shard, hpa_io_vector_t *vec, - size_t vlen, size_t nbytes) { - bool success = opt_process_madvise_max_batch > 0 - && !shard->central->hooks.vectorized_purge(vec, vlen, nbytes); - if (!success) { - /* On failure, it is safe to purge again (potential perf - * penalty) If kernel can tell exactly which regions - * failed, we could avoid that penalty. - */ - for (size_t i = 0; i < vlen; ++i) { - shard->central->hooks.purge(vec[i].iov_base, - vec[i].iov_len); - } - } +static inline size_t +hpa_process_madvise_max_iovec_len(void) { + assert(opt_process_madvise_max_batch <= + PROCESS_MADVISE_MAX_BATCH_LIMIT); + return opt_process_madvise_max_batch == 0 ? + HPA_MIN_VAR_VEC_SIZE : opt_process_madvise_max_batch; } /* Returns whether or not we purged anything. */ @@ -498,38 +479,24 @@ hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) { } size_t total_purged = 0; uint64_t purges_this_pass = 0; - - assert(opt_process_madvise_max_batch <= - PROCESS_MADVISE_MAX_BATCH_LIMIT); - size_t len = opt_process_madvise_max_batch == 0 ? 
- HPA_MIN_VAR_VEC_SIZE : opt_process_madvise_max_batch; + + size_t len = hpa_process_madvise_max_iovec_len(); VARIABLE_ARRAY(hpa_io_vector_t, vec, len); + hpa_range_accum_t accum; + hpa_range_accum_init(&accum, vec, len); + void *purge_addr; size_t purge_size; - size_t cur = 0; - size_t total_batch_bytes = 0; while (hpdata_purge_next(to_purge, &purge_state, &purge_addr, &purge_size)) { - vec[cur].iov_base = purge_addr; - vec[cur].iov_len = purge_size; total_purged += purge_size; assert(total_purged <= HUGEPAGE); + hpa_range_accum_add(&accum, purge_addr, purge_size, shard); purges_this_pass++; - total_batch_bytes += purge_size; - cur++; - if (cur == len) { - hpa_try_vectorized_purge(shard, vec, len, total_batch_bytes); - assert(total_batch_bytes > 0); - cur = 0; - total_batch_bytes = 0; - } - } - - /* Batch was not full */ - if (cur > 0) { - hpa_try_vectorized_purge(shard, vec, cur, total_batch_bytes); } + /* If batch was not full, finish */ + hpa_range_accum_finish(&accum, shard); malloc_mutex_lock(tsdn, &shard->mtx); /* The shard updates */