From cfa90dfd80c4b3ca2b2678fb55cfc718bd9f42c6 Mon Sep 17 00:00:00 2001 From: Slobodan Predolac Date: Tue, 8 Apr 2025 09:51:53 -0700 Subject: [PATCH] Refactor hpa purging to prepare for vectorized call across multiple pages --- include/jemalloc/internal/hpa_utils.h | 82 +++++++++++++++++++++++++++ src/hpa.c | 63 +++++--------------- 2 files changed, 97 insertions(+), 48 deletions(-) create mode 100644 include/jemalloc/internal/hpa_utils.h diff --git a/include/jemalloc/internal/hpa_utils.h b/include/jemalloc/internal/hpa_utils.h new file mode 100644 index 00000000..035d3b21 --- /dev/null +++ b/include/jemalloc/internal/hpa_utils.h @@ -0,0 +1,82 @@ +#ifndef JEMALLOC_INTERNAL_HPA_UTILS_H +#define JEMALLOC_INTERNAL_HPA_UTILS_H + +#include "jemalloc/internal/hpa.h" + +#define HPA_MIN_VAR_VEC_SIZE 8 +#ifdef JEMALLOC_HAVE_PROCESS_MADVISE +typedef struct iovec hpa_io_vector_t; +#else +typedef struct { + void *iov_base; + size_t iov_len; +} hpa_io_vector_t; +#endif + +/* Actually invoke hooks. If we fail vectorized, use single purges */ +static void +hpa_try_vectorized_purge( + hpa_shard_t *shard, hpa_io_vector_t *vec, size_t vlen, size_t nbytes) { + bool success = opt_process_madvise_max_batch > 0 + && !shard->central->hooks.vectorized_purge(vec, vlen, nbytes); + if (!success) { + /* On failure, it is safe to purge again (potential perf + * penalty) If kernel can tell exactly which regions + * failed, we could avoid that penalty. + */ + for (size_t i = 0; i < vlen; ++i) { + shard->central->hooks.purge(vec[i].iov_base, vec[i].iov_len); + } + } +} + +/* + * This struct accumulates the regions for process_madvise. 
It invokes the hook when the batch limit is reached.
shard); } -/* If we fail vectorized purge, we will do single */ -static void -hpa_try_vectorized_purge(hpa_shard_t *shard, hpa_io_vector_t *vec, - size_t vlen, size_t nbytes) { - bool success = opt_process_madvise_max_batch > 0 - && !shard->central->hooks.vectorized_purge(vec, vlen, nbytes); - if (!success) { - /* On failure, it is safe to purge again (potential perf - * penalty) If kernel can tell exactly which regions - * failed, we could avoid that penalty. - */ - for (size_t i = 0; i < vlen; ++i) { - shard->central->hooks.purge(vec[i].iov_base, - vec[i].iov_len); - } - } +static inline size_t +hpa_process_madvise_max_iovec_len(void) { + assert(opt_process_madvise_max_batch <= + PROCESS_MADVISE_MAX_BATCH_LIMIT); + return opt_process_madvise_max_batch == 0 ? + HPA_MIN_VAR_VEC_SIZE : opt_process_madvise_max_batch; } /* Returns whether or not we purged anything. */ @@ -498,38 +479,24 @@ hpa_try_purge(tsdn_t *tsdn, hpa_shard_t *shard) { } size_t total_purged = 0; uint64_t purges_this_pass = 0; - - assert(opt_process_madvise_max_batch <= - PROCESS_MADVISE_MAX_BATCH_LIMIT); - size_t len = opt_process_madvise_max_batch == 0 ? 
- HPA_MIN_VAR_VEC_SIZE : opt_process_madvise_max_batch; + + size_t len = hpa_process_madvise_max_iovec_len(); VARIABLE_ARRAY(hpa_io_vector_t, vec, len); + hpa_range_accum_t accum; + hpa_range_accum_init(&accum, vec, len); + void *purge_addr; size_t purge_size; - size_t cur = 0; - size_t total_batch_bytes = 0; while (hpdata_purge_next(to_purge, &purge_state, &purge_addr, &purge_size)) { - vec[cur].iov_base = purge_addr; - vec[cur].iov_len = purge_size; total_purged += purge_size; assert(total_purged <= HUGEPAGE); + hpa_range_accum_add(&accum, purge_addr, purge_size, shard); purges_this_pass++; - total_batch_bytes += purge_size; - cur++; - if (cur == len) { - hpa_try_vectorized_purge(shard, vec, len, total_batch_bytes); - assert(total_batch_bytes > 0); - cur = 0; - total_batch_bytes = 0; - } - } - - /* Batch was not full */ - if (cur > 0) { - hpa_try_vectorized_purge(shard, vec, cur, total_batch_bytes); } + /* If batch was not full, finish */ + hpa_range_accum_finish(&accum, shard); malloc_mutex_lock(tsdn, &shard->mtx); /* The shard updates */