[thread_event] Remove macros from thread_event and replace with dynamic event objects

2026-05-30 00:27:30 +03:00 · 2025-03-14 06:34:05 -07:00 · 2025-03-14 06:34:05 -07:00 · 153fab2b00
commit 153fab2b00
parent 1972241cd2
17 changed files with 455 additions and 318 deletions
--- a/Makefile.in
+++ b/Makefile.in
@ -156,6 +156,7 @@ C_SRCS := $(srcroot)src/jemalloc.c \
 	$(srcroot)src/tcache.c \
 	$(srcroot)src/test_hooks.c \
 	$(srcroot)src/thread_event.c \
+        $(srcroot)src/thread_event_registry.c \
 	$(srcroot)src/ticker.c \
 	$(srcroot)src/tsd.c \
 	$(srcroot)src/util.c \
--- a/include/jemalloc/internal/peak_event.h
+++ b/include/jemalloc/internal/peak_event.h
@ -4,6 +4,14 @@
 #include "jemalloc/internal/jemalloc_preamble.h"
 #include "jemalloc/internal/tsd_types.h"

+/*
+ * Update every 64K by default.  We're not exposing this as a configuration
+ * option for now; we don't want to bind ourselves too tightly to any particular
+ * performance requirements for small values, or guarantee that we'll even be
+ * able to provide fine-grained accuracy.
+ */
+#define PEAK_EVENT_WAIT (64 * 1024)
+
 /*
 * While peak.h contains the simple helper struct that tracks state, this
 * contains the allocator tie-ins (and knows about tsd, the event module, etc.).
@ -15,13 +23,6 @@ void peak_event_update(tsd_t *tsd);
 void peak_event_zero(tsd_t *tsd);
 uint64_t peak_event_max(tsd_t *tsd);

-/* Manual hooks. */
-/* The activity-triggered hooks. */
-uint64_t peak_alloc_new_event_wait(tsd_t *tsd);
-uint64_t peak_alloc_postponed_event_wait(tsd_t *tsd);
-void peak_alloc_event_handler(tsd_t *tsd, uint64_t elapsed);
-uint64_t peak_dalloc_new_event_wait(tsd_t *tsd);
-uint64_t peak_dalloc_postponed_event_wait(tsd_t *tsd);
-void peak_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed);
+extern te_base_cb_t peak_te_handler;

 #endif /* JEMALLOC_INTERNAL_PEAK_EVENT_H */
--- a/include/jemalloc/internal/prof_externs.h
+++ b/include/jemalloc/internal/prof_externs.h
@ -5,6 +5,7 @@
 #include "jemalloc/internal/base.h"
 #include "jemalloc/internal/mutex.h"
 #include "jemalloc/internal/prof_hook.h"
+#include "jemalloc/internal/thread_event_registry.h"

 extern bool opt_prof;
 extern bool opt_prof_active;
@ -104,9 +105,65 @@ void prof_prefork1(tsdn_t *tsdn);
 void prof_postfork_parent(tsdn_t *tsdn);
 void prof_postfork_child(tsdn_t *tsdn);

-/* Only accessed by thread event. */
 uint64_t prof_sample_new_event_wait(tsd_t *tsd);
-uint64_t prof_sample_postponed_event_wait(tsd_t *tsd);
-void prof_sample_event_handler(tsd_t *tsd, uint64_t elapsed);
+uint64_t tsd_prof_sample_event_wait_get(tsd_t *tsd);
+
+/*
+ * The lookahead functionality facilitates events to be able to lookahead, i.e.
+ * without touching the event counters, to determine whether an event would be
+ * triggered.  The event counters are not advanced until the end of the
+ * allocation / deallocation calls, so the lookahead can be useful if some
+ * preparation work for some event must be done early in the allocation /
+ * deallocation calls.
+ *
+ * Currently only the profiling sampling event needs the lookahead
+ * functionality, so we don't yet define general purpose lookahead functions.
+ *
+ * Surplus is a terminology referring to the amount of bytes beyond what's
+ * needed for triggering an event, which can be a useful quantity to have in
+ * general when lookahead is being called.
+ *
+ * This function returns true if allocating usize bytes would reach or exceed
+ * the next trigger for the prof sample event, and false otherwise.
+ * If it returns true and surplus is non-NULL, *surplus is set to the number
+ * of bytes beyond that trigger.
+ */
+
+JEMALLOC_ALWAYS_INLINE bool
+te_prof_sample_event_lookahead_surplus(tsd_t *tsd, size_t usize,
+    size_t *surplus) {
+	if (surplus != NULL) {
+		/*
+		 * This is a dead store: the surplus will be overwritten before
+		 * any read.  The initialization suppresses compiler warnings.
+		 * Meanwhile, using SIZE_MAX to initialize is good for
+		 * debugging purpose, because a valid surplus value is strictly
+		 * less than usize, which is at most SIZE_MAX.
+		 */
+		*surplus = SIZE_MAX;
+	}
+	if (unlikely(!tsd_nominal(tsd) || tsd_reentrancy_level_get(tsd) > 0)) {
+		return false;
+	}
+	/* The subtraction is intentionally susceptible to underflow. */
+	uint64_t accumbytes = tsd_thread_allocated_get(tsd) + usize -
+	    tsd_thread_allocated_last_event_get(tsd);
+	uint64_t sample_wait = tsd_prof_sample_event_wait_get(tsd);
+	if (accumbytes < sample_wait) {
+		return false;
+	}
+	assert(accumbytes - sample_wait < (uint64_t)usize);
+	if (surplus != NULL) {
+		*surplus = (size_t)(accumbytes - sample_wait);
+	}
+	return true;
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+te_prof_sample_event_lookahead(tsd_t *tsd, size_t usize) {
+	return te_prof_sample_event_lookahead_surplus(tsd, usize, NULL);
+}
+
+extern te_base_cb_t prof_sample_te_handler;

 #endif /* JEMALLOC_INTERNAL_PROF_EXTERNS_H */
--- a/include/jemalloc/internal/prof_threshold.h
+++ b/include/jemalloc/internal/prof_threshold.h
@ -3,9 +3,6 @@

 #include "jemalloc/internal/tsd_types.h"

-/* The activity-triggered hooks. */
-uint64_t prof_threshold_new_event_wait(tsd_t *tsd);
-uint64_t prof_threshold_postponed_event_wait(tsd_t *tsd);
-void prof_threshold_event_handler(tsd_t *tsd, uint64_t elapsed);
+extern te_base_cb_t prof_threshold_te_handler;

 #endif /* JEMALLOC_INTERNAL_THRESHOLD_EVENT_H */
--- a/include/jemalloc/internal/stats.h
+++ b/include/jemalloc/internal/stats.h
@ -3,6 +3,7 @@

 #include "jemalloc/internal/jemalloc_preamble.h"
 #include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/thread_event_registry.h"
 #include "jemalloc/internal/tsd_types.h"

 /*  OPTION(opt,		var_name,	default,	set_value_to) */
@ -43,9 +44,7 @@ extern char opt_stats_interval_opts[stats_print_tot_num_options+1];
 #define STATS_INTERVAL_ACCUM_BATCH_MAX (4 << 20)

 /* Only accessed by thread event. */
-uint64_t stats_interval_new_event_wait(tsd_t *tsd);
-uint64_t stats_interval_postponed_event_wait(tsd_t *tsd);
-void stats_interval_event_handler(tsd_t *tsd, uint64_t elapsed);
+extern te_base_cb_t stats_interval_te_handler;

 /* Implements je_malloc_stats_print. */
 void stats_print(write_cb_t *write_cb, void *cbopaque, const char *opts);
--- a/include/jemalloc/internal/tcache_externs.h
+++ b/include/jemalloc/internal/tcache_externs.h
@ -6,6 +6,7 @@
 #include "jemalloc/internal/cache_bin.h"
 #include "jemalloc/internal/sz.h"
 #include "jemalloc/internal/tcache_types.h"
+#include "jemalloc/internal/thread_event_registry.h"

 extern bool opt_tcache;
 extern size_t opt_tcache_max;
@ -89,4 +90,6 @@ uint64_t tcache_gc_dalloc_new_event_wait(tsd_t *tsd);
 uint64_t tcache_gc_dalloc_postponed_event_wait(tsd_t *tsd);
 void tcache_gc_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed);

+extern te_base_cb_t tcache_gc_te_handler;
+
 #endif /* JEMALLOC_INTERNAL_TCACHE_EXTERNS_H */
--- a/include/jemalloc/internal/thread_event.h
+++ b/include/jemalloc/internal/thread_event.h
@ -49,29 +49,12 @@ void te_event_trigger(tsd_t *tsd, te_ctx_t *ctx);
 void te_recompute_fast_threshold(tsd_t *tsd);
 void tsd_te_init(tsd_t *tsd);

-/*
- * List of all events, in the following format:
- *  E(event,		(condition), is_alloc_event)
- */
-#define ITERATE_OVER_ALL_EVENTS						\
-    E(tcache_gc,		(opt_tcache_gc_incr_bytes > 0), true)	\
-    E(prof_sample,		(config_prof && opt_prof), true)  	\
-    E(prof_threshold,		config_stats, true)  			\
-    E(stats_interval,		(opt_stats_interval >= 0), true)   	\
-    E(tcache_gc_dalloc,		(opt_tcache_gc_incr_bytes > 0), false)	\
-    E(peak_alloc,		config_stats, true)			\
-    E(peak_dalloc,		config_stats, false)
-
-#define E(event, condition_unused, is_alloc_event_unused)		\
-    C(event##_event_wait)
-
 /* List of all thread event counters. */
-#define ITERATE_OVER_ALL_COUNTERS					\
-    C(thread_allocated)							\
-    C(thread_allocated_last_event)					\
-    ITERATE_OVER_ALL_EVENTS						\
-    C(prof_sample_last_event)						\
-    C(stats_interval_last_event)
+#define ITERATE_OVER_ALL_COUNTERS                                       \
+        C(thread_allocated)						\
+	C(thread_allocated_last_event)					\
+	C(prof_sample_last_event)					\
+	C(stats_interval_last_event)

 /* Getters directly wrap TSD getters. */
 #define C(counter)							\
@ -99,12 +82,6 @@ counter##_set(tsd_t *tsd, uint64_t v) {					\
 ITERATE_OVER_ALL_COUNTERS
 #undef C

-/*
- * For generating _event_wait getter / setter functions for each individual
- * event.
- */
-#undef E
-
 /*
 * The malloc and free fastpath getters -- use the unsafe getters since tsd may
 * be non-nominal, in which case the fast_threshold will be set to 0.  This
@ -221,57 +198,6 @@ te_ctx_get(tsd_t *tsd, te_ctx_t *ctx, bool is_alloc) {
 	}
 }

-/*
- * The lookahead functionality facilitates events to be able to lookahead, i.e.
- * without touching the event counters, to determine whether an event would be
- * triggered.  The event counters are not advanced until the end of the
- * allocation / deallocation calls, so the lookahead can be useful if some
- * preparation work for some event must be done early in the allocation /
- * deallocation calls.
- *
- * Currently only the profiling sampling event needs the lookahead
- * functionality, so we don't yet define general purpose lookahead functions.
- *
- * Surplus is a terminology referring to the amount of bytes beyond what's
- * needed for triggering an event, which can be a useful quantity to have in
- * general when lookahead is being called.
- */
-
-JEMALLOC_ALWAYS_INLINE bool
-te_prof_sample_event_lookahead_surplus(tsd_t *tsd, size_t usize,
-    size_t *surplus) {
-	if (surplus != NULL) {
-		/*
-		 * This is a dead store: the surplus will be overwritten before
-		 * any read.  The initialization suppresses compiler warnings.
-		 * Meanwhile, using SIZE_MAX to initialize is good for
-		 * debugging purpose, because a valid surplus value is strictly
-		 * less than usize, which is at most SIZE_MAX.
-		 */
-		*surplus = SIZE_MAX;
-	}
-	if (unlikely(!tsd_nominal(tsd) || tsd_reentrancy_level_get(tsd) > 0)) {
-		return false;
-	}
-	/* The subtraction is intentionally susceptible to underflow. */
-	uint64_t accumbytes = tsd_thread_allocated_get(tsd) + usize -
-	    tsd_thread_allocated_last_event_get(tsd);
-	uint64_t sample_wait = tsd_prof_sample_event_wait_get(tsd);
-	if (accumbytes < sample_wait) {
-		return false;
-	}
-	assert(accumbytes - sample_wait < (uint64_t)usize);
-	if (surplus != NULL) {
-		*surplus = (size_t)(accumbytes - sample_wait);
-	}
-	return true;
-}
-
-JEMALLOC_ALWAYS_INLINE bool
-te_prof_sample_event_lookahead(tsd_t *tsd, size_t usize) {
-	return te_prof_sample_event_lookahead_surplus(tsd, usize, NULL);
-}
-
 JEMALLOC_ALWAYS_INLINE void
 te_event_advance(tsd_t *tsd, size_t usize, bool is_alloc) {
 	te_assert_invariants(tsd);
--- a/include/jemalloc/internal/thread_event_registry.h
+++ b/include/jemalloc/internal/thread_event_registry.h
@ -0,0 +1,58 @@
+#ifndef JEMALLOC_INTERNAL_THREAD_EVENT_REGISTRY_H
+#define JEMALLOC_INTERNAL_THREAD_EVENT_REGISTRY_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/tsd.h"
+
+/* "te" is short for "thread_event" */
+enum te_alloc_e {
+#ifdef JEMALLOC_PROF
+    te_alloc_prof_sample,
+#endif
+    te_alloc_stats_interval,
+#ifdef JEMALLOC_STATS
+    te_alloc_prof_threshold,
+#endif
+    te_alloc_tcache_gc,
+#ifdef JEMALLOC_STATS
+    te_alloc_peak,
+    te_alloc_last = te_alloc_peak,
+#else
+    te_alloc_last = te_alloc_tcache_gc,
+#endif
+    te_alloc_count = te_alloc_last + 1
+};
+typedef enum te_alloc_e te_alloc_t;
+
+enum te_dalloc_e {
+    te_dalloc_tcache_gc,
+#ifdef JEMALLOC_STATS
+    te_dalloc_peak,
+    te_dalloc_last = te_dalloc_peak,
+#else
+    te_dalloc_last = te_dalloc_tcache_gc,
+#endif
+    te_dalloc_count = te_dalloc_last + 1
+};
+typedef enum te_dalloc_e te_dalloc_t;
+
+/* Per-event wait counters; stored in tsd as the te_data field. */
+typedef struct te_data_s te_data_t;
+struct te_data_s {
+	uint64_t alloc_wait[te_alloc_count];
+	uint64_t dalloc_wait[te_dalloc_count];
+};
+#define TE_DATA_INITIALIZER { {0}, {0} }
+
+typedef struct te_base_cb_s te_base_cb_t;
+struct te_base_cb_s {
+    bool (*enabled)(void);
+    uint64_t (*new_event_wait)(tsd_t *tsd);
+    uint64_t (*postponed_event_wait)(tsd_t *tsd);
+    void (*event_handler)(tsd_t *tsd);
+};
+
+extern te_base_cb_t *te_alloc_handlers[te_alloc_count];
+extern te_base_cb_t *te_dalloc_handlers[te_dalloc_count];
+
+#endif /* JEMALLOC_INTERNAL_THREAD_EVENT_REGISTRY_H */
--- a/include/jemalloc/internal/tsd_internals.h
+++ b/include/jemalloc/internal/tsd_internals.h
@ -15,6 +15,7 @@
 #include "jemalloc/internal/rtree_tsd.h"
 #include "jemalloc/internal/tcache_structs.h"
 #include "jemalloc/internal/tcache_types.h"
+#include "jemalloc/internal/thread_event_registry.h"
 #include "jemalloc/internal/tsd_types.h"
 #include "jemalloc/internal/util.h"
 #include "jemalloc/internal/witness.h"
@ -68,15 +69,9 @@ typedef ql_elm(tsd_t) tsd_link_t;
    O(thread_allocated_next_event,	uint64_t,	uint64_t)	\
    O(thread_deallocated_last_event,	uint64_t,	uint64_t)	\
    O(thread_deallocated_next_event,	uint64_t,	uint64_t)	\
-    O(tcache_gc_event_wait,	uint64_t,		uint64_t)	\
-    O(tcache_gc_dalloc_event_wait,	uint64_t,	uint64_t)	\
-    O(prof_sample_event_wait,	uint64_t,		uint64_t)	\
+    O(te_data, 			te_data_t,		te_data_t)	\
    O(prof_sample_last_event,	uint64_t,		uint64_t)	\
-    O(prof_threshold_event_wait,	uint64_t,	uint64_t)	\
-    O(stats_interval_event_wait,	uint64_t,	uint64_t)	\
-    O(stats_interval_last_event,	uint64_t,	uint64_t)	\
-    O(peak_alloc_event_wait,	uint64_t,		uint64_t)	\
-    O(peak_dalloc_event_wait,	uint64_t,	uint64_t)		\
+    O(stats_interval_last_event, uint64_t, 		uint64_t)	\
    O(prof_tdata,		prof_tdata_t *,		prof_tdata_t *)	\
    O(prng_state,		uint64_t,		uint64_t)	\
    O(san_extents_until_guard_small,	uint64_t,	uint64_t)	\
@ -102,15 +97,9 @@ typedef ql_elm(tsd_t) tsd_link_t;
    /* thread_allocated_next_event */	0,				\
    /* thread_deallocated_last_event */	0,				\
    /* thread_deallocated_next_event */	0,				\
-    /* tcache_gc_event_wait */		0,				\
-    /* tcache_gc_dalloc_event_wait */	0,				\
-    /* prof_sample_event_wait */	0,				\
+    /* te_data */			TE_DATA_INITIALIZER,   		\
    /* prof_sample_last_event */	0,				\
-    /* prof_threshold_event_wait */	0,				\
-    /* stats_interval_event_wait */	0,				\
    /* stats_interval_last_event */	0,				\
-    /* peak_alloc_event_wait */		0,				\
-    /* peak_dalloc_event_wait */	0,				\
    /* prof_tdata */		NULL,					\
    /* prng_state */		0,					\
    /* san_extents_until_guard_small */	0,				\
--- a/src/peak_event.c
+++ b/src/peak_event.c
@ -5,14 +5,7 @@

 #include "jemalloc/internal/activity_callback.h"
 #include "jemalloc/internal/peak.h"
-
-/*
- * Update every 64K by default.  We're not exposing this as a configuration
- * option for now; we don't want to bind ourselves too tightly to any particular
- * performance requirements for small values, or guarantee that we'll even be
- * able to provide fine-grained accuracy.
- */
-#define PEAK_EVENT_WAIT (64 * 1024)
+#include "jemalloc/internal/thread_event_registry.h"

 /* Update the peak with current tsd state. */
 void
@ -49,34 +42,31 @@ peak_event_max(tsd_t *tsd) {
 	return peak_max(peak);
 }

-uint64_t
-peak_alloc_new_event_wait(tsd_t *tsd) {
+static uint64_t
+peak_event_new_event_wait(tsd_t *tsd) {
 	return PEAK_EVENT_WAIT;
 }

-uint64_t
-peak_alloc_postponed_event_wait(tsd_t *tsd) {
+static uint64_t
+peak_event_postponed_event_wait(tsd_t *tsd) {
 	return TE_MIN_START_WAIT;
 }

-void
-peak_alloc_event_handler(tsd_t *tsd, uint64_t elapsed) {
+static void
+peak_event_handler(tsd_t *tsd) {
 	peak_event_update(tsd);
 	peak_event_activity_callback(tsd);
 }

-uint64_t
-peak_dalloc_new_event_wait(tsd_t *tsd) {
-	return PEAK_EVENT_WAIT;
+static bool
+peak_event_enabled(void) {
+	return config_stats;
 }

-uint64_t
-peak_dalloc_postponed_event_wait(tsd_t *tsd) {
-	return TE_MIN_START_WAIT;
-}
-
-void
-peak_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed) {
-	peak_event_update(tsd);
-	peak_event_activity_callback(tsd);
-}
+/* Handles alloc and dalloc */
+te_base_cb_t peak_te_handler = {
+	.enabled = &peak_event_enabled,
+	.new_event_wait = &peak_event_new_event_wait,
+	.postponed_event_wait = &peak_event_postponed_event_wait,
+	.event_handler = &peak_event_handler,
+};
--- a/src/prof.c
+++ b/src/prof.c
@ -12,6 +12,7 @@
 #include "jemalloc/internal/prof_sys.h"
 #include "jemalloc/internal/prof_hook.h"
 #include "jemalloc/internal/thread_event.h"
+#include "jemalloc/internal/thread_event_registry.h"

 /*
 * This file implements the profiling "APIs" needed by other parts of jemalloc,
@ -289,8 +290,40 @@ prof_sample_new_event_wait(tsd_t *tsd) {
 #endif
 }

+void
+prof_sample_event_handler(tsd_t *tsd) {
+	cassert(config_prof);
+	if (prof_interval == 0 || !prof_active_get_unlocked()) {
+		return;
+	}
+	uint64_t last_event = thread_allocated_last_event_get(tsd);
+	uint64_t last_sample_event = tsd_prof_sample_last_event_get(tsd);
+	tsd_prof_sample_last_event_set(tsd, last_event);
+	uint64_t elapsed = last_event - last_sample_event;
+	assert(elapsed > 0 && elapsed != TE_INVALID_ELAPSED);
+	if (counter_accum(tsd_tsdn(tsd), &prof_idump_accumulated, elapsed)) {
+		prof_idump(tsd_tsdn(tsd));
+	}
+}
+
+static bool
+prof_sample_enabled(void) {
+	return config_prof && opt_prof;
+}
+
 uint64_t
-prof_sample_postponed_event_wait(tsd_t *tsd) {
+tsd_prof_sample_event_wait_get(tsd_t *tsd) {
+#ifdef JEMALLOC_PROF
+	return tsd_te_datap_get_unsafe(tsd)->alloc_wait[te_alloc_prof_sample];
+#else
+	not_reached();
+	return TE_MAX_START_WAIT;
+#endif
+}
+
+te_base_cb_t prof_sample_te_handler = {
+	.enabled = &prof_sample_enabled,
+	.new_event_wait = &prof_sample_new_event_wait,
 	/*
 	 * The postponed wait time for prof sample event is computed as if we
 	 * want a new wait time (i.e. as if the event were triggered).  If we
@ -298,21 +331,10 @@ prof_sample_postponed_event_wait(tsd_t *tsd) {
 	 * handling the other events, then we can have sampling bias, if e.g.
 	 * the allocation immediately following a reentrancy always comes from
 	 * the same stack trace.
-	 */
-	return prof_sample_new_event_wait(tsd);
-}
-
-void
-prof_sample_event_handler(tsd_t *tsd, uint64_t elapsed) {
-	cassert(config_prof);
-	assert(elapsed > 0 && elapsed != TE_INVALID_ELAPSED);
-	if (prof_interval == 0 || !prof_active_get_unlocked()) {
-		return;
-	}
-	if (counter_accum(tsd_tsdn(tsd), &prof_idump_accumulated, elapsed)) {
-		prof_idump(tsd_tsdn(tsd));
-	}
-}
+	 */
+	.postponed_event_wait = &prof_sample_new_event_wait,
+	.event_handler = &prof_sample_event_handler,
+};

 static void
 prof_fdump(void) {
--- a/src/prof_threshold.c
+++ b/src/prof_threshold.c
@ -52,6 +52,18 @@ prof_threshold_postponed_event_wait(tsd_t *tsd) {
 }

 void
-prof_threshold_event_handler(tsd_t *tsd, uint64_t elapsed) {
+prof_threshold_event_handler(tsd_t *tsd) {
 	prof_threshold_update(tsd);
 }
+
+static bool
+prof_threshold_enabled(void) {
+	return config_stats;
+}
+
+te_base_cb_t prof_threshold_te_handler = {
+	.enabled = &prof_threshold_enabled,
+	.new_event_wait = &prof_threshold_new_event_wait,
+	.postponed_event_wait = &prof_threshold_postponed_event_wait,
+	.event_handler = &prof_threshold_event_handler,
+};
--- a/src/stats.c
+++ b/src/stats.c
@ -65,7 +65,7 @@ char opt_stats_interval_opts[stats_print_tot_num_options+1] = "";

 static counter_accum_t stats_interval_accumulated;
 /* Per thread batch accum size for stats_interval. */
-static uint64_t stats_interval_accum_batch;
+uint64_t stats_interval_accum_batch;

 /******************************************************************************/

@ -2128,7 +2128,12 @@ stats_interval_postponed_event_wait(tsd_t *tsd) {
 }

 void
-stats_interval_event_handler(tsd_t *tsd, uint64_t elapsed) {
+stats_interval_event_handler(tsd_t *tsd) {
+	uint64_t last_event = thread_allocated_last_event_get(tsd);
+	uint64_t last_sample_event = tsd_stats_interval_last_event_get(tsd);
+	tsd_stats_interval_last_event_set(tsd, last_event);
+	uint64_t elapsed = last_event - last_sample_event;
+
 	assert(elapsed > 0 && elapsed != TE_INVALID_ELAPSED);
 	if (counter_accum(tsd_tsdn(tsd), &stats_interval_accumulated,
 	    elapsed)) {
@ -2136,6 +2141,19 @@ stats_interval_event_handler(tsd_t *tsd, uint64_t elapsed) {
 	}
 }

+static bool
+stats_interval_enabled(void) {
+	return opt_stats_interval >= 0;
+}
+
+te_base_cb_t stats_interval_te_handler = {
+	.enabled = &stats_interval_enabled,
+	.new_event_wait = &stats_interval_new_event_wait,
+	.postponed_event_wait = &stats_interval_postponed_event_wait,
+	.event_handler = &stats_interval_event_handler,
+};
+
+
 bool
 stats_boot(void) {
 	uint64_t stats_interval;
--- a/src/tcache.c
+++ b/src/tcache.c
@ -511,7 +511,7 @@ tcache_try_gc_bin(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache,
 }

 static void
-tcache_event(tsd_t *tsd) {
+tcache_gc_event(tsd_t *tsd) {
 	tcache_t *tcache = tcache_get(tsd);
 	if (tcache == NULL) {
 		return;
@ -581,18 +581,6 @@ tcache_event(tsd_t *tsd) {
 	tcache_slow->next_gc_bin_large = szind_large;
 }

-void
-tcache_gc_event_handler(tsd_t *tsd, uint64_t elapsed) {
-	assert(elapsed == TE_INVALID_ELAPSED);
-	tcache_event(tsd);
-}
-
-void
-tcache_gc_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed) {
-	assert(elapsed == TE_INVALID_ELAPSED);
-	tcache_event(tsd);
-}
-
 void *
 tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena,
    tcache_t *tcache, cache_bin_t *cache_bin, szind_t binind,
@ -1912,3 +1900,16 @@ tcache_postfork_child(tsdn_t *tsdn) {
 void tcache_assert_initialized(tcache_t *tcache) {
 	assert(!cache_bin_still_zero_initialized(&tcache->bins[0]));
 }
+
+static bool
+tcache_gc_enabled(void) {
+    return (opt_tcache_gc_incr_bytes > 0);
+}
+
+/* Handles alloc and dalloc the same way */
+te_base_cb_t tcache_gc_te_handler = {
+	.enabled = &tcache_gc_enabled,
+	.new_event_wait = &tcache_gc_new_event_wait,
+	.postponed_event_wait = &tcache_gc_postponed_event_wait,
+	.event_handler = &tcache_gc_event,
+};
--- a/src/thread_event.c
+++ b/src/thread_event.c
@ -2,108 +2,46 @@
 #include "jemalloc/internal/jemalloc_internal_includes.h"

 #include "jemalloc/internal/thread_event.h"
-
-/*
- * Signatures for event specific functions.  These functions should be defined
- * by the modules owning each event.  The signatures here verify that the
- * definitions follow the right format.
- *
- * The first two are functions computing new / postponed event wait time.  New
- * event wait time is the time till the next event if an event is currently
- * being triggered; postponed event wait time is the time till the next event
- * if an event should be triggered but needs to be postponed, e.g. when the TSD
- * is not nominal or during reentrancy.
- *
- * The third is the event handler function, which is called whenever an event
- * is triggered.  The parameter is the elapsed time since the last time an
- * event of the same type was triggered.
- */
-#define E(event, condition_unused, is_alloc_event_unused)		\
-uint64_t event##_new_event_wait(tsd_t *tsd);				\
-uint64_t event##_postponed_event_wait(tsd_t *tsd);			\
-void event##_event_handler(tsd_t *tsd, uint64_t elapsed);
-
-ITERATE_OVER_ALL_EVENTS
-#undef E
-
-/* Signatures for internal functions fetching elapsed time. */
-#define E(event, condition_unused, is_alloc_event_unused)		\
-static uint64_t event##_fetch_elapsed(tsd_t *tsd);
-
-ITERATE_OVER_ALL_EVENTS
-#undef E
-
-static uint64_t
-tcache_gc_fetch_elapsed(tsd_t *tsd) {
-	return TE_INVALID_ELAPSED;
-}
-
-static uint64_t
-tcache_gc_dalloc_fetch_elapsed(tsd_t *tsd) {
-	return TE_INVALID_ELAPSED;
-}
-
-static uint64_t
-prof_sample_fetch_elapsed(tsd_t *tsd) {
-	uint64_t last_event = thread_allocated_last_event_get(tsd);
-	uint64_t last_sample_event = prof_sample_last_event_get(tsd);
-	prof_sample_last_event_set(tsd, last_event);
-	return last_event - last_sample_event;
-}
-
-static uint64_t
-stats_interval_fetch_elapsed(tsd_t *tsd) {
-	uint64_t last_event = thread_allocated_last_event_get(tsd);
-	uint64_t last_stats_event = stats_interval_last_event_get(tsd);
-	stats_interval_last_event_set(tsd, last_event);
-	return last_event - last_stats_event;
-}
-
-static uint64_t
-peak_alloc_fetch_elapsed(tsd_t *tsd) {
-	return TE_INVALID_ELAPSED;
-}
-
-static uint64_t
-peak_dalloc_fetch_elapsed(tsd_t *tsd) {
-	return TE_INVALID_ELAPSED;
-}
-
-static uint64_t
-prof_threshold_fetch_elapsed(tsd_t *tsd) {
-	return TE_INVALID_ELAPSED;
-}
-
-/* Per event facilities done. */
+#include "jemalloc/internal/thread_event_registry.h"
+#include "jemalloc/internal/peak_event.h"

 static bool
 te_ctx_has_active_events(te_ctx_t *ctx) {
 	assert(config_debug);
-#define E(event, condition, alloc_event)			       \
-	if (condition && alloc_event == ctx->is_alloc) {	       \
-		return true;					       \
+	if (ctx->is_alloc) {
+		for (int i = 0; i < te_alloc_count; ++i) {
+			if (te_alloc_handlers[i]->enabled()) {
+				return true;
+			}
+		}
+	} else {
+		for (int i = 0; i < te_dalloc_count; ++i) {
+			if (te_dalloc_handlers[i]->enabled()) {
+				return true;
+			}
+		}
 	}
-	ITERATE_OVER_ALL_EVENTS
-#undef E
 	return false;
 }

 static uint64_t
 te_next_event_compute(tsd_t *tsd, bool is_alloc) {
+	te_base_cb_t **handlers = is_alloc ? te_alloc_handlers : te_dalloc_handlers;
+	uint64_t *waits = is_alloc ? tsd_te_datap_get_unsafe(tsd)->alloc_wait : tsd_te_datap_get_unsafe(tsd)->dalloc_wait;
+	int count = is_alloc ? te_alloc_count : te_dalloc_count;
+	
 	uint64_t wait = TE_MAX_START_WAIT;
-#define E(event, condition, alloc_event)				\
-	if (is_alloc == alloc_event && condition) {			\
-		uint64_t event_wait =					\
-		    event##_event_wait_get(tsd);			\
-		assert(event_wait <= TE_MAX_START_WAIT);		\
-		if (event_wait > 0U && event_wait < wait) {		\
-			wait = event_wait;				\
-		}							\
+
+	for (int i = 0; i < count; i++) {
+		if (handlers[i]->enabled()) {
+			uint64_t ev_wait = waits[i];
+			assert(ev_wait <= TE_MAX_START_WAIT);
+			if (ev_wait > 0U && ev_wait < wait) {
+				wait = ev_wait;
+			}
+		}
 	}

-	ITERATE_OVER_ALL_EVENTS
-#undef E
-	assert(wait <= TE_MAX_START_WAIT);
 	return wait;
 }

@ -238,18 +176,132 @@ te_adjust_thresholds_helper(tsd_t *tsd, te_ctx_t *ctx,
 	te_ctx_next_event_set(tsd, ctx, next_event);
 }

-static uint64_t
-te_clip_event_wait(uint64_t event_wait) {
-	assert(event_wait > 0U);
-	if (TE_MIN_START_WAIT > 1U &&
-	    unlikely(event_wait < TE_MIN_START_WAIT)) {
-		event_wait = TE_MIN_START_WAIT;
+static void
+te_init_waits(tsd_t *tsd, uint64_t *wait, bool is_alloc) {
+	te_base_cb_t **handlers = is_alloc ? te_alloc_handlers : te_dalloc_handlers;
+	uint64_t *waits = is_alloc ? tsd_te_datap_get_unsafe(tsd)->alloc_wait : tsd_te_datap_get_unsafe(tsd)->dalloc_wait;
+	int count = is_alloc ? te_alloc_count : te_dalloc_count;
+	for (int i = 0; i < count; i++) {
+		if (handlers[i]->enabled()) {
+			uint64_t ev_wait = handlers[i]->new_event_wait(tsd);
+			assert(ev_wait > 0);
+			waits[i] = ev_wait;
+			if (ev_wait < *wait) {
+				*wait = ev_wait;
+			}
+		}
 	}
-	if (TE_MAX_START_WAIT < UINT64_MAX &&
-	    unlikely(event_wait > TE_MAX_START_WAIT)) {
-		event_wait = TE_MAX_START_WAIT;
+}
+
+static inline bool
+te_update_wait(tsd_t *tsd, uint64_t accumbytes, bool allow,
+	       uint64_t *ev_wait, uint64_t *wait, te_base_cb_t *handler,
+	       uint64_t new_wait) {
+	bool ret = false;
+	if (*ev_wait > accumbytes) {
+                *ev_wait -= accumbytes;
+        } else if (!allow) {
+                *ev_wait = handler->postponed_event_wait(tsd);
+        } else {
+                ret = true;
+                *ev_wait = new_wait == 0 ?
+		    handler->new_event_wait(tsd) :
+		    new_wait;
+        }
+
+        assert(*ev_wait > 0);
+        if (*ev_wait < *wait) {
+                *wait = *ev_wait;
+        }
+	return ret;
+}
+
+extern uint64_t stats_interval_accum_batch;
+/* Returns the number of handlers enqueued into the to_trigger array. */
+static inline size_t
+te_update_alloc_events(tsd_t *tsd, te_base_cb_t **to_trigger,
+		       uint64_t accumbytes, bool allow, uint64_t *wait) {
+	/*
+	 * We do not loop and invoke the functions via interface because
+	 * of the perf cost.  This path is relatively hot, so we sacrifice
+	 * elegance for perf.
+	 */
+	size_t nto_trigger = 0;
+	uint64_t *waits = tsd_te_datap_get_unsafe(tsd)->alloc_wait;
+	if (opt_tcache_gc_incr_bytes > 0) {
+		assert(te_alloc_handlers[te_alloc_tcache_gc]->enabled());
+		if (te_update_wait(tsd, accumbytes, allow,
+				   &waits[te_alloc_tcache_gc], wait,
+				   te_alloc_handlers[te_alloc_tcache_gc],
+				   opt_tcache_gc_incr_bytes)) {
+			to_trigger[nto_trigger++] =
+			    te_alloc_handlers[te_alloc_tcache_gc];
+		}
 	}
-	return event_wait;
+#ifdef JEMALLOC_PROF
+	if (opt_prof) {
+		assert(te_alloc_handlers[te_alloc_prof_sample]->enabled());
+		if (te_update_wait(tsd, accumbytes, allow,
+				   &waits[te_alloc_prof_sample], wait,
+				   te_alloc_handlers[te_alloc_prof_sample], 0)) {
+			to_trigger[nto_trigger++] =
+			    te_alloc_handlers[te_alloc_prof_sample];
+		}
+	}
+#endif
+	if (opt_stats_interval >= 0) {
+		assert(te_alloc_handlers[te_alloc_stats_interval]->enabled());
+		if (te_update_wait(tsd, accumbytes, allow,
+				   &waits[te_alloc_stats_interval],
+				   wait,
+				   te_alloc_handlers[te_alloc_stats_interval],
+				   stats_interval_accum_batch)) {
+			to_trigger[nto_trigger++] =
+			    te_alloc_handlers[te_alloc_stats_interval];
+		}
+	}
+
+#ifdef JEMALLOC_STATS
+	assert(te_alloc_handlers[te_alloc_peak]->enabled());
+	if (te_update_wait(tsd, accumbytes, allow, &waits[te_alloc_peak], wait,
+			   te_alloc_handlers[te_alloc_peak], PEAK_EVENT_WAIT)) {
+		to_trigger[nto_trigger++] = te_alloc_handlers[te_alloc_peak];
+	}
+	/* Shift as uint64_t so a large lg threshold cannot overflow an int. */
+	assert(te_alloc_handlers[te_alloc_prof_threshold]->enabled());
+	if (te_update_wait(tsd, accumbytes, allow,
+			   &waits[te_alloc_prof_threshold], wait,
+			   te_alloc_handlers[te_alloc_prof_threshold],
+			   (uint64_t)1 << opt_experimental_lg_prof_threshold)) {
+		to_trigger[nto_trigger++] = te_alloc_handlers[te_alloc_prof_threshold];
+	}
+#endif
+	return nto_trigger;
+}
+
+static inline size_t
+te_update_dalloc_events(tsd_t *tsd, te_base_cb_t **to_trigger, uint64_t accumbytes,
+			bool allow, uint64_t *wait) {
+	size_t nto_trigger = 0;
+	uint64_t *waits = tsd_te_datap_get_unsafe(tsd)->dalloc_wait;
+	if (opt_tcache_gc_incr_bytes > 0) {
+		assert(te_dalloc_handlers[te_dalloc_tcache_gc]->enabled());
+		if (te_update_wait(tsd, accumbytes, allow,
+				   &waits[te_dalloc_tcache_gc], wait,
+				   te_dalloc_handlers[te_dalloc_tcache_gc],
+				   opt_tcache_gc_incr_bytes)) {
+			to_trigger[nto_trigger++] =
+			    te_dalloc_handlers[te_dalloc_tcache_gc];
+		}
+        }
+#ifdef JEMALLOC_STATS
+	assert(te_dalloc_handlers[te_dalloc_peak]->enabled());
+        if(te_update_wait(tsd, accumbytes, allow, &waits[te_dalloc_peak], wait,
+			  te_dalloc_handlers[te_dalloc_peak], PEAK_EVENT_WAIT)) {
+		to_trigger[nto_trigger++] = te_dalloc_handlers[te_dalloc_peak];
+ 	}
+#endif
+	return nto_trigger;
 }

 void
@ -263,47 +315,32 @@ te_event_trigger(tsd_t *tsd, te_ctx_t *ctx) {

 	bool allow_event_trigger = tsd_nominal(tsd) &&
 	    tsd_reentrancy_level_get(tsd) == 0;
-	bool is_alloc = ctx->is_alloc;
 	uint64_t wait = TE_MAX_START_WAIT;

-#define E(event, condition, alloc_event)				\
-	bool is_##event##_triggered = false;				\
-	if (is_alloc == alloc_event && condition) {			\
-		uint64_t event_wait = event##_event_wait_get(tsd);	\
-		assert(event_wait <= TE_MAX_START_WAIT);		\
-		if (event_wait > accumbytes) {				\
-			event_wait -= accumbytes;			\
-		} else if (!allow_event_trigger) {			\
-			event_wait = event##_postponed_event_wait(tsd);	\
-		} else {						\
-			is_##event##_triggered = true;			\
-			event_wait = event##_new_event_wait(tsd);	\
-		}							\
-		event_wait = te_clip_event_wait(event_wait);		\
-		event##_event_wait_set(tsd, event_wait);		\
-		if (event_wait < wait) {				\
-			wait = event_wait;				\
-		}							\
+	assert((int)te_alloc_count >= (int) te_dalloc_count);
+	te_base_cb_t *to_trigger[te_alloc_count];
+	size_t nto_trigger;
+	if (ctx->is_alloc) {
+		nto_trigger = te_update_alloc_events(tsd, to_trigger,
+						     accumbytes,
+						     allow_event_trigger,
+						     &wait);
+	} else {
+		nto_trigger = te_update_dalloc_events(tsd, to_trigger,
+						      accumbytes,
+						      allow_event_trigger,
+						      &wait);
 	}

-	ITERATE_OVER_ALL_EVENTS
-#undef E
-
-	assert(wait <= TE_MAX_START_WAIT);
+        assert(wait <= TE_MAX_START_WAIT);
 	te_adjust_thresholds_helper(tsd, ctx, wait);
 	te_assert_invariants(tsd);

-#define E(event, condition, alloc_event)				\
-	if (is_alloc == alloc_event && condition &&			\
-	    is_##event##_triggered) {					\
-		assert(allow_event_trigger);				\
-		uint64_t elapsed = event##_fetch_elapsed(tsd);		\
-		event##_event_handler(tsd, elapsed);			\
+	for (size_t i = 0; i < nto_trigger; i++) {
+		assert(allow_event_trigger);
+		to_trigger[i]->event_handler(tsd);
 	}

-	ITERATE_OVER_ALL_EVENTS
-#undef E
-
 	te_assert_invariants(tsd);
 }

@ -323,18 +360,8 @@ te_init(tsd_t *tsd, bool is_alloc) {
 	te_ctx_last_event_set(&ctx, te_ctx_current_bytes_get(&ctx));

 	uint64_t wait = TE_MAX_START_WAIT;
-#define E(event, condition, alloc_event)				\
-	if (is_alloc == alloc_event && condition) {			\
-		uint64_t event_wait = event##_new_event_wait(tsd);	\
-		event_wait = te_clip_event_wait(event_wait);		\
-		event##_event_wait_set(tsd, event_wait);		\
-		if (event_wait < wait) {				\
-			wait = event_wait;				\
-		}							\
-	}
+	te_init_waits(tsd, &wait, is_alloc);

-	ITERATE_OVER_ALL_EVENTS
-#undef E
 	te_adjust_thresholds_helper(tsd, &ctx, wait);
 }

--- a/src/thread_event_registry.c
+++ b/src/thread_event_registry.c
@ -0,0 +1,37 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/jemalloc_internal_includes.h"
+
+#include "jemalloc/internal/thread_event.h"
+#include "jemalloc/internal/thread_event_registry.h"
+#include "jemalloc/internal/thread_event_registry.h"
+#include "jemalloc/internal/tcache_externs.h"
+#include "jemalloc/internal/peak_event.h"
+#include "jemalloc/internal/prof_externs.h"
+#include "jemalloc/internal/prof_threshold.h"
+#include "jemalloc/internal/stats.h"
+
+
+/*
+ * Tables of all the thread events.  Events share an interface, but
+ * internally each one knows its own data layout in tsd.
+ */
+te_base_cb_t *te_alloc_handlers[te_alloc_count] = {
+#ifdef JEMALLOC_PROF
+    &prof_sample_te_handler,
+#endif
+    &stats_interval_te_handler,
+#ifdef JEMALLOC_STATS
+    &prof_threshold_te_handler,
+#endif
+    &tcache_gc_te_handler,
+#ifdef JEMALLOC_STATS
+    &peak_te_handler,
+#endif
+};
+
+te_base_cb_t *te_dalloc_handlers[te_dalloc_count] = {
+	&tcache_gc_te_handler,
+#ifdef JEMALLOC_STATS
+	&peak_te_handler,
+#endif
+};
--- a/test/unit/thread_event.c
+++ b/test/unit/thread_event.c
@ -8,12 +8,11 @@ TEST_BEGIN(test_next_event_fast) {
 	te_ctx_last_event_set(&ctx, 0);
 	te_ctx_current_bytes_set(&ctx, TE_NEXT_EVENT_FAST_MAX - 8U);
 	te_ctx_next_event_set(tsd, &ctx, TE_NEXT_EVENT_FAST_MAX);
-#define E(event, condition, is_alloc)					\
-	if (is_alloc && condition) {					\
-		event##_event_wait_set(tsd, TE_NEXT_EVENT_FAST_MAX);	\
+
+	uint64_t *waits = tsd_te_datap_get_unsafe(tsd)->alloc_wait;
+	for (size_t i = 0; i < te_alloc_count; i++) {
+		waits[i] = TE_NEXT_EVENT_FAST_MAX;
 	}
-	ITERATE_OVER_ALL_EVENTS
-#undef E

 	/* Test next_event_fast rolling back to 0. */
 	void *p = malloc(16U);