Refactor hpa purging to prepare for vectorized call across multiple pages

This commit is contained in:
Slobodan Predolac 2025-04-08 09:51:53 -07:00 committed by Qi Wang
parent a3910b9802
commit cfa90dfd80
2 changed files with 97 additions and 48 deletions

View file

@@ -0,0 +1,82 @@
#ifndef JEMALLOC_INTERNAL_HPA_UTILS_H
#define JEMALLOC_INTERNAL_HPA_UTILS_H
#include "jemalloc/internal/hpa.h"
/*
 * Vector length used for the purge batch when
 * opt_process_madvise_max_batch is 0 (batching disabled).
 */
#define HPA_MIN_VAR_VEC_SIZE 8
#ifdef JEMALLOC_HAVE_PROCESS_MADVISE
/* process_madvise() consumes struct iovec directly. */
typedef struct iovec hpa_io_vector_t;
#else
/*
 * Stand-in with the same two fields as struct iovec so the rest of the
 * code can be written against one type on platforms without
 * process_madvise().
 */
typedef struct {
	void *iov_base;
	size_t iov_len;
} hpa_io_vector_t;
#endif
/*
 * Actually invoke the purge hooks.  Try the vectorized hook first; if
 * batching is disabled or the hook fails, fall back to purging each
 * range individually.
 */
static void
hpa_try_vectorized_purge(
    hpa_shard_t *shard, hpa_io_vector_t *vec, size_t vlen, size_t nbytes) {
	if (opt_process_madvise_max_batch > 0
	    && !shard->central->hooks.vectorized_purge(vec, vlen, nbytes)) {
		/* Vectorized purge succeeded; nothing left to do. */
		return;
	}
	/*
	 * It is safe to purge the same regions again (at a potential perf
	 * penalty).  If the kernel could report exactly which regions
	 * failed, we could avoid that cost.
	 */
	for (size_t i = 0; i < vlen; ++i) {
		shard->central->hooks.purge(vec[i].iov_base, vec[i].iov_len);
	}
}
/*
 * Accumulates address ranges destined for process_madvise.
 * hpa_range_accum_add() flushes the batch through the purge hooks once
 * the vector is full; hpa_range_accum_finish() flushes any remainder.
 */
typedef struct {
	/* Caller-provided vector backing the batch. */
	hpa_io_vector_t *vp;
	/* Number of entries currently filled in vp. */
	size_t cur;
	/* Sum of iov_len over the filled entries. */
	size_t total_bytes;
	/* Length of vp; flushing happens when cur reaches this. */
	size_t capacity;
} hpa_range_accum_t;
/* Set up an empty accumulator over caller-provided vector storage. */
static inline void
hpa_range_accum_init(hpa_range_accum_t *ra, hpa_io_vector_t *v, size_t sz) {
	ra->cur = 0;
	ra->total_bytes = 0;
	ra->vp = v;
	ra->capacity = sz;
}
/* Push the accumulated batch through the purge hooks and reset. */
static inline void
hpa_range_accum_flush(hpa_range_accum_t *ra, hpa_shard_t *shard) {
	assert(ra->cur > 0 && ra->total_bytes > 0);
	size_t nranges = ra->cur;
	size_t nbytes = ra->total_bytes;
	/* Reset before purging; the hooks never touch the accumulator. */
	ra->cur = 0;
	ra->total_bytes = 0;
	hpa_try_vectorized_purge(shard, ra->vp, nranges, nbytes);
}
/*
 * Append one (addr, sz) range to the batch; flushes automatically when
 * the vector fills up.
 */
static inline void
hpa_range_accum_add(
    hpa_range_accum_t *ra, void *addr, size_t sz, hpa_shard_t *shard) {
	assert(ra->cur < ra->capacity);
	hpa_io_vector_t *slot = &ra->vp[ra->cur++];
	slot->iov_base = addr;
	slot->iov_len = sz;
	ra->total_bytes += sz;
	if (ra->cur == ra->capacity) {
		hpa_range_accum_flush(ra, shard);
	}
}
/* Flush any partially-filled batch left in the accumulator. */
static inline void
hpa_range_accum_finish(hpa_range_accum_t *ra, hpa_shard_t *shard) {
	if (ra->cur == 0) {
		return;
	}
	hpa_range_accum_flush(ra, shard);
}
#endif /* JEMALLOC_INTERNAL_HPA_UTILS_H */

View file

@@ -2,22 +2,13 @@
#include "jemalloc/internal/jemalloc_internal_includes.h"
#include "jemalloc/internal/hpa.h"
#include "jemalloc/internal/hpa_utils.h"
#include "jemalloc/internal/fb.h"
#include "jemalloc/internal/witness.h"
#define HPA_EDEN_SIZE (128 * HUGEPAGE)
#define HPA_MIN_VAR_VEC_SIZE 8
#ifdef JEMALLOC_HAVE_PROCESS_MADVISE
typedef struct iovec hpa_io_vector_t;
#else
typedef struct {
void *iov_base;
size_t iov_len;
} hpa_io_vector_t;
#endif
/* Forward declaration; the definition appears later in this file. */
static edata_t *hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
size_t alignment, bool zero, bool guarded, bool frequent_reuse,
bool *deferred_work_generated);
@@ -432,22 +423,12 @@ hpa_shard_has_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard) {
return to_hugify != NULL || hpa_should_purge(tsdn, shard);
}
/* If we fail vectorized purge, we will do single */
static void
hpa_try_vectorized_purge(hpa_shard_t *shard, hpa_io_vector_t *vec,
size_t vlen, size_t nbytes) {
bool success = opt_process_madvise_max_batch > 0
&& !shard->central->hooks.vectorized_purge(vec, vlen, nbytes);
if (!success) {
/* On failure, it is safe to purge again (potential perf
* penalty) If kernel can tell exactly which regions
* failed, we could avoid that penalty.
*/
for (size_t i = 0; i < vlen; ++i) {
shard->central->hooks.purge(vec[i].iov_base,
vec[i].iov_len);
}
}
/*
 * Length of the iovec array used for batched purging: the configured
 * batch size, or HPA_MIN_VAR_VEC_SIZE when batching is disabled.
 */
static inline size_t
hpa_process_madvise_max_iovec_len(void) {
	assert(opt_process_madvise_max_batch <=
	    PROCESS_MADVISE_MAX_BATCH_LIMIT);
	if (opt_process_madvise_max_batch != 0) {
		return opt_process_madvise_max_batch;
	}
	return HPA_MIN_VAR_VEC_SIZE;
}
/* Returns whether or not we purged anything. */
@@ -498,38 +479,24 @@ hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) {
}
size_t total_purged = 0;
uint64_t purges_this_pass = 0;
assert(opt_process_madvise_max_batch <=
PROCESS_MADVISE_MAX_BATCH_LIMIT);
size_t len = opt_process_madvise_max_batch == 0 ?
HPA_MIN_VAR_VEC_SIZE : opt_process_madvise_max_batch;
size_t len = hpa_process_madvise_max_iovec_len();
VARIABLE_ARRAY(hpa_io_vector_t, vec, len);
hpa_range_accum_t accum;
hpa_range_accum_init(&accum, vec, len);
void *purge_addr;
size_t purge_size;
size_t cur = 0;
size_t total_batch_bytes = 0;
while (hpdata_purge_next(to_purge, &purge_state, &purge_addr,
&purge_size)) {
vec[cur].iov_base = purge_addr;
vec[cur].iov_len = purge_size;
total_purged += purge_size;
assert(total_purged <= HUGEPAGE);
hpa_range_accum_add(&accum, purge_addr, purge_size, shard);
purges_this_pass++;
total_batch_bytes += purge_size;
cur++;
if (cur == len) {
hpa_try_vectorized_purge(shard, vec, len, total_batch_bytes);
assert(total_batch_bytes > 0);
cur = 0;
total_batch_bytes = 0;
}
}
/* Batch was not full */
if (cur > 0) {
hpa_try_vectorized_purge(shard, vec, cur, total_batch_bytes);
}
/* If batch was not full, finish */
hpa_range_accum_finish(&accum, shard);
malloc_mutex_lock(tsdn, &shard->mtx);
/* The shard updates */