Add a page-allocator microbenchmark.

2026-06-02 02:04:20 +03:00 · 2025-08-24 23:57:26 -07:00 · 2025-08-24 23:57:26 -07:00 · 65ac0e2799
commit 65ac0e2799
parent 301b69ea27
3 changed files with 797 additions and 3 deletions
--- a/Makefile.in
+++ b/Makefile.in
@ -570,11 +570,15 @@ $(objroot)test/stress/pa/pa_data_preprocessor$(EXE): $(objroot)test/stress/pa/pa
 	@mkdir -p $(@D)
 	$(CXX) $(LDTARGET) $(filter %.$(O),$^) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)

+$(objroot)test/stress/pa/pa_microbench$(EXE): $(objroot)test/stress/pa/pa_microbench.$(O) $(C_JET_OBJS) $(C_TESTLIB_STRESS_OBJS)
+	@mkdir -p $(@D)
+	$(CC) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)
+
 $(objroot)test/stress/pa/%.$(O): $(srcroot)test/stress/pa/%.c
 	@mkdir -p $(@D)
-	$(CC) $(CFLAGS) -c $(CPPFLAGS) -I$(srcroot)test/include -I$(objroot)test/include $(CTARGET) $<
+	$(CC) $(CFLAGS) -c $(CPPFLAGS) -DJEMALLOC_STRESS_TEST -I$(srcroot)test/include -I$(objroot)test/include $(CTARGET) $<
 ifdef CC_MM
-	@$(CC) -MM $(CPPFLAGS) -I$(srcroot)test/include -I$(objroot)test/include -MT $@ -o $(@:%.$(O)=%.d) $<
+	@$(CC) -MM $(CPPFLAGS) -DJEMALLOC_STRESS_TEST -I$(srcroot)test/include -I$(objroot)test/include -MT $@ -o $(@:%.$(O)=%.d) $<
 endif

 $(objroot)test/stress/pa/%.$(O): $(srcroot)test/stress/pa/%.cpp
@ -704,7 +708,7 @@ tests_unit: $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%$(EXE))
 tests_integration: $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%$(EXE)) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%$(EXE))
 tests_analyze: $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%$(EXE))
 tests_stress: $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%$(EXE)) $(TESTS_STRESS_CPP:$(srcroot)%.cpp=$(objroot)%$(EXE))
-tests_pa: $(objroot)test/stress/pa/pa_data_preprocessor$(EXE)
+tests_pa: $(objroot)test/stress/pa/pa_data_preprocessor$(EXE) $(objroot)test/stress/pa/pa_microbench$(EXE)
 tests: tests_unit tests_integration tests_analyze tests_stress

 check_unit_dir:
--- a/test/stress/pa/README.md
+++ b/test/stress/pa/README.md
@ -0,0 +1,118 @@
+# Page Allocator (PA) Microbenchmark Suite
+
+This directory contains a comprehensive microbenchmark suite for testing and analyzing jemalloc's Page Allocator (PA) system, including the Hugepage-aware Page Allocator (HPA) and Slab Extent Cache (SEC) components.
+
+## Overview
+
+The PA microbenchmark suite consists of two main programs designed to preprocess allocation traces and replay them against jemalloc's internal PA system to measure performance, memory usage, and allocation patterns.
+
+To summarize how to run it, assume we have a file `test/stress/pa/data/hpa.csv` collected from a real application using USDT, the simulation can be run as follows:
+```
+make tests_pa
+./test/stress/pa/pa_data_preprocessor hpa test/stress/pa/data/hpa.csv test/stress/pa/data/sample_hpa_output.csv
+./test/stress/pa/pa_microbench -p -o test/stress/pa/data/sample_hpa_stats.csv test/stress/pa/data/sample_hpa_output.csv
+```
+
+If it's sec, simply replace the first parameter passed to `pa_data_preprocessor` with sec.
+
+## Architecture
+
+### PA System Components
+
+The Page Allocator sits at the core of jemalloc's memory management hierarchy:
+
+```
+Application
+    ↓
+Arena (tcache, bins)
+    ↓
+PA (Page Allocator) ← This is what we benchmark
+    ├── HPA (Hugepage-aware Page Allocator)
+    └── SEC (Slab Extent Cache)
+    ↓
+Extent Management (emap, edata)
+    ↓
+Base Allocator
+    ↓
+OS (mmap/munmap)
+```
+
+### Microbenchmark Architecture
+
+```
+Raw Allocation Traces
+    ↓
+[pa_data_preprocessor] ← Preprocesses and filters traces
+    ↓
+CSV alloc/dalloc Files
+    ↓
+[pa_microbench] ← Replays against real PA system
+    ↓
+Performance Statistics & Analysis
+```
+
+## Programs
+
+### 1. pa_data_preprocessor
+
+A C++ data preprocessing tool that converts raw allocation traces into a standardized CSV format suitable for microbenchmarking.
+
+**Purpose:**
+- Parse and filter raw allocation trace data
+- Convert various trace formats to standardized CSV
+- Filter by process ID, thread ID, or other criteria
+- Validate and clean allocation/deallocation sequences
+
+### 2. pa_microbench
+
+A C microbenchmark that replays allocation traces against jemalloc's actual PA system to measure performance and behavior with HPA statistics collection.
+
+**Purpose:**
+- Initialize real PA infrastructure (HPA, SEC, base allocators, emaps)
+- Replay allocation/deallocation sequences from CSV traces
+- Measure allocation latency, memory usage, and fragmentation
+- Test different PA configurations (HPA-only vs HPA+SEC)
+- Generate detailed HPA internal statistics
+
+**Key Features:**
+- **Real PA Integration**: Uses jemalloc's actual PA implementation, not simulation
+- **Multi-shard Support**: Tests allocation patterns across multiple PA shards
+- **Configurable Modes**: Supports HPA-only mode (`-p`) and HPA+SEC mode (`-s`)
+- **Statistics Output**: Detailed per-shard statistics and timing data
+- **Configurable Intervals**: Customizable statistics output frequency (`-i/--interval`)
+
+## Building
+
+### Compilation
+
+```bash
+# Build both PA microbenchmark tools
+cd /path/to/jemalloc
+make tests_pa
+```
+
+This creates:
+- `test/stress/pa/pa_data_preprocessor` - Data preprocessing tool
+- `test/stress/pa/pa_microbench` - PA microbenchmark
+
+## Usage
+
+### Data Preprocessing
+
+```bash
+# Basic preprocessing
+./test/stress/pa/pa_data_preprocessor <hpa/sec> input_trace.txt output.csv
+```
+
+### Microbenchmark Execution
+
+```bash
+# Run with HPA + SEC (default mode)
+./test/stress/pa/pa_microbench -s -o stats.csv trace.csv
+
+# Run with HPA-only (no SEC)
+./test/stress/pa/pa_microbench -p -o stats.csv trace.csv
+
+# Show help
+./test/stress/pa/pa_microbench -h
+```
--- a/test/stress/pa/pa_microbench.c
+++ b/test/stress/pa/pa_microbench.c
@ -0,0 +1,672 @@
+#include "test/jemalloc_test.h"
+
+/* Additional includes for PA functionality */
+#include "jemalloc/internal/pa.h"
+#include "jemalloc/internal/tsd.h"
+#include "jemalloc/internal/sz.h"
+#include "jemalloc/internal/base.h"
+#include "jemalloc/internal/ehooks.h"
+#include "jemalloc/internal/nstime.h"
+#include "jemalloc/internal/hpa.h"
+#include "jemalloc/internal/sec.h"
+#include "jemalloc/internal/emap.h"
+#include "jemalloc/internal/psset.h"
+
+/*
+ * PA Microbenchmark (Simplified Version)
+ *
+ * This tool reads allocation traces and simulates PA behavior
+ * for testing and understanding the allocation patterns.
+ *
+ * Features:
+ * 1. Reads CSV input file with format: shard_ind,operation,size_or_alloc_index,is_frequent
+ * 2. Simulates allocations/deallocations tracking
+ * 3. Provides basic statistics analysis
+ * 4. Validates the framework setup
+ */
+
+#define MAX_LINE_LENGTH 1024
+#define MAX_ALLOCATIONS 10000000
+#define MAX_ARENAS 128
+
+typedef enum { PA_ALLOC = 0, PA_DALLOC = 1 } pa_op_t;
+
+typedef struct {
+	int     shard_ind;
+	pa_op_t operation;
+	size_t  size_or_alloc_index;
+	int     is_frequent;
+} pa_event_t;
+
+typedef struct {
+	edata_t *edata;
+	size_t   size;
+	int      shard_ind;
+	bool     active;
+} allocation_record_t;
+
+/* Structure to group per-shard tracking statistics */
+typedef struct {
+	uint64_t alloc_count;     /* Number of allocations */
+	uint64_t dealloc_count;   /* Number of deallocations */
+	uint64_t bytes_allocated; /* Current bytes allocated */
+} shard_stats_t;
+
+/* Structure to group per-shard PA infrastructure */
+typedef struct {
+	base_t          *base;        /* Base allocator */
+	emap_t           emap;        /* Extent map */
+	pa_shard_t       pa_shard;    /* PA shard */
+	pa_shard_stats_t shard_stats; /* PA shard statistics */
+	malloc_mutex_t   stats_mtx;   /* Statistics mutex */
+} shard_infrastructure_t;
+
+static FILE                *g_stats_output = NULL; /* Output file for stats */
+static size_t               g_alloc_counter = 0; /* Global allocation counter */
+static allocation_record_t *g_alloc_records =
+    NULL;                     /* Global allocation tracking */
+static bool g_use_sec = true; /* Global flag for SEC vs HPA-only */
+
+/* Refactored arrays using structures */
+static shard_stats_t *g_shard_stats = NULL; /* Per-shard tracking statistics */
+static shard_infrastructure_t *g_shard_infra =
+    NULL;                         /* Per-shard PA infrastructure */
+static pa_central_t g_pa_central; /* Global PA central */
+
+static void cleanup_pa_infrastructure(int num_shards);
+
+static bool
+initialize_pa_infrastructure(int num_shards) {
+	/*
+	 * Note when we call malloc, it resolves to je_malloc, while internal
+	 * functions like base_new resolve to jet_malloc.  This is because this
+	 * file is compiled with -DJEMALLOC_JET as a test.  This allows us to
+	 * completely isolate the PA infrastructure benchmark from the rest of
+	 * the jemalloc usage.
+	*/
+	void *dummy_jet = jet_malloc(16);
+	if (dummy_jet == NULL) {
+		fprintf(stderr, "Failed to initialize JET jemalloc\n");
+		return 1;
+	}
+
+	/* Force JET system to be fully initialized */
+	if (jet_mallctl("epoch", NULL, NULL, NULL, 0) != 0) {
+		fprintf(stderr, "Failed to initialize JET system fully\n");
+		jet_free(dummy_jet);
+		return 1;
+	}
+	jet_free(dummy_jet);
+
+	/* Allocate shard tracking statistics */
+	g_shard_stats = calloc(num_shards, sizeof(shard_stats_t));
+	if (g_shard_stats == NULL) {
+		printf("DEBUG: Failed to allocate shard stats\n");
+		return true;
+	}
+
+	/* Allocate shard infrastructure */
+	g_shard_infra = calloc(num_shards, sizeof(shard_infrastructure_t));
+	if (g_shard_infra == NULL) {
+		printf("DEBUG: Failed to allocate shard infrastructure\n");
+		free(g_shard_stats);
+		return true;
+	}
+
+	/* Initialize one base allocator for PA central */
+	base_t *central_base = base_new(tsd_tsdn(tsd_fetch()), 0 /* ind */,
+	    (extent_hooks_t *)&ehooks_default_extent_hooks,
+	    /* metadata_use_hooks */ true);
+	if (central_base == NULL) {
+		printf("DEBUG: Failed to create central_base\n");
+		free(g_shard_stats);
+		free(g_shard_infra);
+		return true;
+	}
+
+	/* Initialize PA central with HPA enabled */
+	if (pa_central_init(&g_pa_central, central_base, true /* hpa */,
+	        &hpa_hooks_default)) {
+		printf("DEBUG: Failed to initialize PA central\n");
+		base_delete(tsd_tsdn(tsd_fetch()), central_base);
+		free(g_shard_stats);
+		free(g_shard_infra);
+		return true;
+	}
+
+	for (int i = 0; i < num_shards; i++) {
+		/* Create a separate base allocator for each shard */
+		g_shard_infra[i].base = base_new(tsd_tsdn(tsd_fetch()),
+		    i /* ind */, (extent_hooks_t *)&ehooks_default_extent_hooks,
+		    /* metadata_use_hooks */ true);
+		if (g_shard_infra[i].base == NULL) {
+			printf("DEBUG: Failed to create base %d\n", i);
+			/* Clean up partially initialized shards */
+			cleanup_pa_infrastructure(num_shards);
+			return true;
+		}
+
+		/* Initialize emap for this shard */
+		if (emap_init(&g_shard_infra[i].emap, g_shard_infra[i].base,
+		        /* zeroed */ false)) {
+			printf("DEBUG: Failed to initialize emap %d\n", i);
+			/* Clean up partially initialized shards */
+			cleanup_pa_infrastructure(num_shards);
+			return true;
+		}
+
+		/* Initialize stats mutex */
+		if (malloc_mutex_init(&g_shard_infra[i].stats_mtx,
+		        "pa_shard_stats", WITNESS_RANK_OMIT,
+		        malloc_mutex_rank_exclusive)) {
+			printf(
+			    "DEBUG: Failed to initialize stats mutex %d\n", i);
+			/* Clean up partially initialized shards */
+			cleanup_pa_infrastructure(num_shards);
+			return true;
+		}
+
+		/* Initialize PA shard */
+		nstime_t cur_time;
+		nstime_init_zero(&cur_time);
+
+		if (pa_shard_init(tsd_tsdn(tsd_fetch()),
+		        &g_shard_infra[i].pa_shard, &g_pa_central,
+		        &g_shard_infra[i].emap /* emap */,
+		        g_shard_infra[i].base, i /* ind */,
+		        &g_shard_infra[i].shard_stats /* stats */,
+		        &g_shard_infra[i].stats_mtx /* stats_mtx */,
+		        &cur_time /* cur_time */,
+		        SIZE_MAX /* oversize_threshold */,
+		        -1 /* dirty_decay_ms */, -1 /* muzzy_decay_ms */)) {
+			printf("DEBUG: Failed to initialize PA shard %d\n", i);
+			/* Clean up partially initialized shards */
+			cleanup_pa_infrastructure(num_shards);
+			return true;
+		}
+
+		/* Enable HPA for this shard with proper configuration */
+		hpa_shard_opts_t hpa_opts = HPA_SHARD_OPTS_DEFAULT;
+		hpa_opts.deferral_allowed =
+		    false; /* No background threads in microbench */
+
+		sec_opts_t sec_opts = SEC_OPTS_DEFAULT;
+		if (!g_use_sec) {
+			/* Disable SEC by setting nshards to 0 */
+			sec_opts.nshards = 0;
+		}
+
+		if (pa_shard_enable_hpa(tsd_tsdn(tsd_fetch()),
+		        &g_shard_infra[i].pa_shard, &hpa_opts, &sec_opts)) {
+			fprintf(
+			    stderr, "Failed to enable HPA on shard %d\n", i);
+			/* Clean up partially initialized shards */
+			cleanup_pa_infrastructure(num_shards);
+			return true;
+		}
+	}
+
+	printf("PA infrastructure configured: HPA=enabled, SEC=%s\n",
+	    g_use_sec ? "enabled" : "disabled");
+
+	return false;
+}
+
+static void
+cleanup_pa_infrastructure(int num_shards) {
+	if (g_shard_infra != NULL) {
+		for (int i = 0; i < num_shards; i++) {
+			pa_shard_destroy(
+			    tsd_tsdn(tsd_fetch()), &g_shard_infra[i].pa_shard);
+			if (g_shard_infra[i].base != NULL) {
+				base_delete(tsd_tsdn(tsd_fetch()),
+				    g_shard_infra[i].base);
+			}
+		}
+		free(g_shard_infra);
+		g_shard_infra = NULL;
+	}
+
+	if (g_shard_stats != NULL) {
+		free(g_shard_stats);
+		g_shard_stats = NULL;
+	}
+}
+
+static bool
+parse_csv_line(const char *line, pa_event_t *event) {
+	/* Expected format: shard_ind,operation,size_or_alloc_index,is_frequent */
+	int operation;
+	int fields = sscanf(line, "%d,%d,%zu,%d", &event->shard_ind, &operation,
+	    &event->size_or_alloc_index, &event->is_frequent);
+
+	if (fields < 3) { /* is_frequent is optional */
+		return false;
+	}
+
+	if (fields == 3) {
+		event->is_frequent = 0; /* Default value */
+	}
+
+	if (operation == 0) {
+		event->operation = PA_ALLOC;
+	} else if (operation == 1) {
+		event->operation = PA_DALLOC;
+	} else {
+		return false;
+	}
+
+	return true;
+}
+
+static size_t
+load_trace_file(const char *filename, pa_event_t **events, int *max_shard_id) {
+	FILE *file = fopen(filename, "r");
+	if (!file) {
+		fprintf(stderr, "Failed to open trace file: %s\n", filename);
+		return 0;
+	}
+
+	*events = malloc(MAX_ALLOCATIONS * sizeof(pa_event_t));
+	if (!*events) {
+		fclose(file);
+		return 0;
+	}
+
+	char   line[MAX_LINE_LENGTH];
+	size_t count = 0;
+	*max_shard_id = 0;
+
+	/* Skip header line */
+	if (fgets(line, sizeof(line), file) == NULL) {
+		fclose(file);
+		free(*events);
+		return 0;
+	}
+
+	while (fgets(line, sizeof(line), file) && count < MAX_ALLOCATIONS) {
+		if (parse_csv_line(line, &(*events)[count])) {
+			if ((*events)[count].shard_ind > *max_shard_id) {
+				*max_shard_id = (*events)[count].shard_ind;
+			}
+			count++;
+		}
+	}
+
+	fclose(file);
+	printf("Loaded %zu events from %s\n", count, filename);
+	printf("Maximum shard ID found: %d\n", *max_shard_id);
+	return count;
+}
+
+static void
+collect_hpa_stats(int shard_id, hpa_shard_stats_t *hpa_stats_out) {
+	/* Get tsdn for statistics collection */
+	tsdn_t *tsdn = tsd_tsdn(tsd_fetch());
+
+	/* Clear the output structure */
+	memset(hpa_stats_out, 0, sizeof(hpa_shard_stats_t));
+
+	/* Check if this shard has HPA enabled */
+	if (!g_shard_infra[shard_id].pa_shard.ever_used_hpa) {
+		return;
+	}
+
+	/* Merge HPA statistics from the shard */
+	hpa_shard_stats_merge(
+	    tsdn, &g_shard_infra[shard_id].pa_shard.hpa_shard, hpa_stats_out);
+}
+
+static void
+print_shard_stats(int shard_id, size_t operation_count) {
+	if (!g_stats_output) {
+		return;
+	}
+
+	/* Collect HPA statistics */
+	hpa_shard_stats_t hpa_stats;
+	collect_hpa_stats(shard_id, &hpa_stats);
+	psset_stats_t *psset_stats = &hpa_stats.psset_stats;
+
+	/* Total pageslabs */
+	size_t total_pageslabs = psset_stats->merged.npageslabs;
+
+	/* Full pageslabs breakdown by hugification */
+	size_t full_pageslabs_non_huge =
+	    psset_stats->full_slabs[0].npageslabs; /* [0] = non-hugified */
+	size_t full_pageslabs_huge =
+	    psset_stats->full_slabs[1].npageslabs; /* [1] = hugified */
+	size_t full_pageslabs_total = full_pageslabs_non_huge
+	    + full_pageslabs_huge;
+
+	/* Empty pageslabs breakdown by hugification */
+	size_t empty_pageslabs_non_huge =
+	    psset_stats->empty_slabs[0].npageslabs; /* [0] = non-hugified */
+	size_t empty_pageslabs_huge =
+	    psset_stats->empty_slabs[1].npageslabs; /* [1] = hugified */
+	size_t empty_pageslabs_total = empty_pageslabs_non_huge
+	    + empty_pageslabs_huge;
+
+	/* Hugified pageslabs (full + empty + partial) */
+	size_t hugified_pageslabs = full_pageslabs_huge + empty_pageslabs_huge;
+	/* Add hugified partial slabs */
+	for (int i = 0; i < PSSET_NPSIZES; i++) {
+		hugified_pageslabs +=
+		    psset_stats->nonfull_slabs[i][1].npageslabs;
+	}
+
+	/* Dirty bytes */
+	size_t dirty_bytes = psset_stats->merged.ndirty * PAGE;
+
+	/* Output enhanced stats with detailed breakdown */
+	fprintf(g_stats_output,
+	    "%zu,%d,%lu,%lu,%lu,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%lu,%lu,%lu\n",
+	    operation_count, shard_id, g_shard_stats[shard_id].alloc_count,
+	    g_shard_stats[shard_id].dealloc_count,
+	    g_shard_stats[shard_id].bytes_allocated, total_pageslabs,
+	    full_pageslabs_total, empty_pageslabs_total, hugified_pageslabs,
+	    full_pageslabs_non_huge, full_pageslabs_huge,
+	    empty_pageslabs_non_huge, empty_pageslabs_huge, dirty_bytes,
+	    hpa_stats.nonderived_stats.nhugifies,
+	    hpa_stats.nonderived_stats.nhugify_failures,
+	    hpa_stats.nonderived_stats.ndehugifies);
+	fflush(g_stats_output);
+}
+
+static void
+simulate_trace(
+    int num_shards, pa_event_t *events, size_t count, size_t stats_interval) {
+	uint64_t total_allocs = 0, total_deallocs = 0;
+	uint64_t total_allocated_bytes = 0;
+
+	printf("Starting simulation with %zu events across %d shards...\n",
+	    count, num_shards);
+
+	for (size_t i = 0; i < count; i++) {
+		pa_event_t *event = &events[i];
+
+		/* Validate shard index */
+		if (event->shard_ind >= num_shards) {
+			fprintf(stderr,
+			    "Warning: Invalid shard index %d (max %d)\n",
+			    event->shard_ind, num_shards - 1);
+			continue;
+		}
+
+		switch (event->operation) {
+		case PA_ALLOC: {
+			size_t size = event->size_or_alloc_index;
+
+			/* Get tsdn and calculate parameters for PA */
+			tsdn_t *tsdn = tsd_tsdn(tsd_fetch());
+			szind_t szind = sz_size2index(size);
+			bool    slab =
+			    event
+			        ->is_frequent; /* Use frequent_reuse for slab */
+			bool deferred_work_generated = false;
+
+			/* Allocate using PA allocator */
+			edata_t *edata = pa_alloc(tsdn,
+			    &g_shard_infra[event->shard_ind].pa_shard, size,
+			    PAGE /* alignment */, slab, szind, false /* zero */,
+			    false /* guarded */, &deferred_work_generated);
+
+			if (edata != NULL) {
+				/* Store allocation record */
+				g_alloc_records[g_alloc_counter].edata = edata;
+				g_alloc_records[g_alloc_counter].size = size;
+				g_alloc_records[g_alloc_counter].shard_ind =
+				    event->shard_ind;
+				g_alloc_records[g_alloc_counter].active = true;
+				g_alloc_counter++;
+
+				/* Update shard-specific stats */
+				g_shard_stats[event->shard_ind].alloc_count++;
+				g_shard_stats[event->shard_ind]
+				    .bytes_allocated += size;
+
+				total_allocs++;
+				total_allocated_bytes += size;
+			}
+			break;
+		}
+		case PA_DALLOC: {
+			size_t alloc_index = event->size_or_alloc_index;
+			if (alloc_index < g_alloc_counter
+			    && g_alloc_records[alloc_index].active
+			    && g_alloc_records[alloc_index].shard_ind
+			        == event->shard_ind) {
+				/* Get tsdn for PA */
+				tsdn_t *tsdn = tsd_tsdn(tsd_fetch());
+				bool    deferred_work_generated = false;
+
+				/* Deallocate using PA allocator */
+				pa_dalloc(tsdn,
+				    &g_shard_infra[event->shard_ind].pa_shard,
+				    g_alloc_records[alloc_index].edata,
+				    &deferred_work_generated);
+
+				/* Update shard-specific stats */
+				g_shard_stats[event->shard_ind].dealloc_count++;
+				g_shard_stats[event->shard_ind]
+				    .bytes_allocated -=
+				    g_alloc_records[alloc_index].size;
+
+				g_alloc_records[alloc_index].active = false;
+				total_deallocs++;
+			}
+			break;
+		}
+		}
+
+		/* Periodic stats output and progress reporting */
+		if (stats_interval > 0 && (i + 1) % stats_interval == 0) {
+			/* Print stats for all shards */
+			for (int j = 0; j < num_shards; j++) {
+				print_shard_stats(j, i + 1);
+			}
+		}
+	}
+
+	printf("\nSimulation completed:\n");
+	printf("  Total allocations: %lu\n", total_allocs);
+	printf("  Total deallocations: %lu\n", total_deallocs);
+	printf("  Total allocated: %lu bytes\n", total_allocated_bytes);
+	printf("  Active allocations: %lu\n", g_alloc_counter - total_deallocs);
+
+	/* Print final stats for all shards */
+	printf("\nFinal shard statistics:\n");
+	for (int i = 0; i < num_shards; i++) {
+		printf(
+		    "  Shard %d: Allocs=%lu, Deallocs=%lu, Active Bytes=%lu\n",
+		    i, g_shard_stats[i].alloc_count,
+		    g_shard_stats[i].dealloc_count,
+		    g_shard_stats[i].bytes_allocated);
+
+		/* Final stats to file */
+		print_shard_stats(i, count);
+	}
+}
+
+static void
+cleanup_remaining_allocations(int num_shards) {
+	size_t cleaned_up = 0;
+
+	printf("Cleaning up remaining allocations...\n");
+
+	for (size_t i = 0; i < g_alloc_counter; i++) {
+		if (g_alloc_records[i].active) {
+			int shard_ind = g_alloc_records[i].shard_ind;
+			if (shard_ind < num_shards) {
+				tsdn_t *tsdn = tsd_tsdn(tsd_fetch());
+				bool    deferred_work_generated = false;
+
+				pa_dalloc(tsdn,
+				    &g_shard_infra[shard_ind].pa_shard,
+				    g_alloc_records[i].edata,
+				    &deferred_work_generated);
+
+				g_alloc_records[i].active = false;
+				cleaned_up++;
+			}
+		}
+	}
+
+	printf("Cleaned up %zu remaining allocations\n", cleaned_up);
+}
+
+static void
+print_usage(const char *program) {
+	printf("Usage: %s [options] <trace_file.csv>\n", program);
+	printf("Options:\n");
+	printf("  -h, --help           Show this help message\n");
+	printf(
+	    "  -o, --output FILE    Output file for statistics (default: stdout)\n");
+	printf("  -s, --sec            Use SEC (default)\n");
+	printf("  -p, --hpa-only       Use HPA only (no SEC)\n");
+	printf(
+	    "  -i, --interval N     Stats print interval (default: 100000, 0=disable)\n");
+	printf(
+	    "\nTrace file format: shard_ind,operation,size_or_alloc_index,is_frequent\n");
+	printf("  - operation: 0=alloc, 1=dealloc\n");
+	printf("  - is_frequent: optional column\n");
+}
+
+int
+main(int argc, char *argv[]) {
+	const char *trace_file = NULL;
+	const char *stats_output_file = NULL;
+	size_t      stats_interval = 100000; /* Default stats print interval */
+	/* Parse command line arguments */
+	for (int i = 1; i < argc; i++) {
+		if (strcmp(argv[i], "-h") == 0
+		    || strcmp(argv[i], "--help") == 0) {
+			print_usage(argv[0]);
+			return 0;
+		} else if (strcmp(argv[i], "-o") == 0
+		    || strcmp(argv[i], "--output") == 0) {
+			if (i + 1 >= argc) {
+				fprintf(stderr,
+				    "Error: %s requires an argument\n",
+				    argv[i]);
+				return 1;
+			}
+			stats_output_file = argv[++i];
+		} else if (strcmp(argv[i], "-s") == 0
+		    || strcmp(argv[i], "--sec") == 0) {
+			g_use_sec = true;
+		} else if (strcmp(argv[i], "-p") == 0
+		    || strcmp(argv[i], "--hpa-only") == 0) {
+			g_use_sec = false;
+		} else if (strcmp(argv[i], "-i") == 0
+		    || strcmp(argv[i], "--interval") == 0) {
+			if (i + 1 >= argc) {
+				fprintf(stderr,
+				    "Error: %s requires an argument\n",
+				    argv[i]);
+				return 1;
+			}
+			stats_interval = (size_t)atol(argv[++i]);
+		} else if (argv[i][0] != '-') {
+			trace_file = argv[i];
+		} else {
+			fprintf(stderr, "Unknown option: %s\n", argv[i]);
+			print_usage(argv[0]);
+			return 1;
+		}
+	}
+
+	if (!trace_file) {
+		fprintf(stderr, "Error: No trace file specified\n");
+		print_usage(argv[0]);
+		return 1;
+	}
+
+	printf("Trace file: %s\n", trace_file);
+	printf("Mode: %s\n", g_use_sec ? "PA with SEC" : "HPA only");
+
+	/* Open stats output file */
+	if (stats_output_file) {
+		g_stats_output = fopen(stats_output_file, "w");
+		if (!g_stats_output) {
+			fprintf(stderr,
+			    "Failed to open stats output file: %s\n",
+			    stats_output_file);
+			return 1;
+		}
+		printf("Stats output: %s\n", stats_output_file);
+
+		/* Write CSV header */
+		fprintf(g_stats_output,
+		    "operation_count,shard_id,alloc_count,dealloc_count,active_bytes,"
+		    "total_pageslabs,full_pageslabs_total,empty_pageslabs_total,hugified_pageslabs,"
+		    "full_pageslabs_non_huge,full_pageslabs_huge,"
+		    "empty_pageslabs_non_huge,empty_pageslabs_huge,"
+		    "dirty_bytes,nhugifies,nhugify_failures,ndehugifies\n");
+	}
+
+	/* Load trace data and determine max number of arenas */
+	pa_event_t *events;
+	int         max_shard_id;
+	size_t      event_count = load_trace_file(
+            trace_file, &events, &max_shard_id);
+	if (event_count == 0) {
+		if (g_stats_output)
+			fclose(g_stats_output);
+		return 1;
+	}
+
+	int num_shards = max_shard_id + 1; /* shard IDs are 0-based */
+	if (num_shards > MAX_ARENAS) {
+		fprintf(stderr, "Error: Too many arenas required (%d > %d)\n",
+		    num_shards, MAX_ARENAS);
+		free(events);
+		if (g_stats_output)
+			fclose(g_stats_output);
+		return 1;
+	}
+
+	/* Allocate allocation tracking array */
+	g_alloc_records = malloc(event_count * sizeof(allocation_record_t));
+
+	if (!g_alloc_records) {
+		fprintf(
+		    stderr, "Failed to allocate allocation tracking array\n");
+		free(events);
+		if (g_stats_output) {
+			fclose(g_stats_output);
+		}
+		return 1;
+	}
+
+	/* Initialize PA infrastructure */
+	if (initialize_pa_infrastructure(num_shards)) {
+		fprintf(stderr, "Failed to initialize PA infrastructure\n");
+		free(events);
+		free(g_alloc_records);
+		if (g_stats_output) {
+			fclose(g_stats_output);
+		}
+		return 1;
+	}
+
+	/* Run simulation */
+	simulate_trace(num_shards, events, event_count, stats_interval);
+
+	/* Clean up remaining allocations */
+	cleanup_remaining_allocations(num_shards);
+
+	/* Cleanup PA infrastructure */
+	cleanup_pa_infrastructure(num_shards);
+
+	/* Cleanup */
+	free(g_alloc_records);
+	free(events);
+
+	if (g_stats_output) {
+		fclose(g_stats_output);
+		printf("Statistics written to: %s\n", stats_output_file);
+	}
+
+	return 0;
+}