diff --git a/Makefile.in b/Makefile.in index 7085a22a..2519ed83 100644 --- a/Makefile.in +++ b/Makefile.in @@ -156,6 +156,7 @@ C_SRCS := $(srcroot)src/jemalloc.c \ $(srcroot)src/tcache.c \ $(srcroot)src/test_hooks.c \ $(srcroot)src/thread_event.c \ + $(srcroot)src/thread_event_registry.c \ $(srcroot)src/ticker.c \ $(srcroot)src/tsd.c \ $(srcroot)src/util.c \ diff --git a/include/jemalloc/internal/peak_event.h b/include/jemalloc/internal/peak_event.h index cc2a1401..1e339ff8 100644 --- a/include/jemalloc/internal/peak_event.h +++ b/include/jemalloc/internal/peak_event.h @@ -4,6 +4,14 @@ #include "jemalloc/internal/jemalloc_preamble.h" #include "jemalloc/internal/tsd_types.h" +/* + * Update every 64K by default. We're not exposing this as a configuration + * option for now; we don't want to bind ourselves too tightly to any particular + * performance requirements for small values, or guarantee that we'll even be + * able to provide fine-grained accuracy. + */ +#define PEAK_EVENT_WAIT (64 * 1024) + /* * While peak.h contains the simple helper struct that tracks state, this * contains the allocator tie-ins (and knows about tsd, the event module, etc.). @@ -15,13 +23,6 @@ void peak_event_update(tsd_t *tsd); void peak_event_zero(tsd_t *tsd); uint64_t peak_event_max(tsd_t *tsd); -/* Manual hooks. */ -/* The activity-triggered hooks. 
*/ -uint64_t peak_alloc_new_event_wait(tsd_t *tsd); -uint64_t peak_alloc_postponed_event_wait(tsd_t *tsd); -void peak_alloc_event_handler(tsd_t *tsd, uint64_t elapsed); -uint64_t peak_dalloc_new_event_wait(tsd_t *tsd); -uint64_t peak_dalloc_postponed_event_wait(tsd_t *tsd); -void peak_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed); +extern te_base_cb_t peak_te_handler; #endif /* JEMALLOC_INTERNAL_PEAK_EVENT_H */ diff --git a/include/jemalloc/internal/prof_externs.h b/include/jemalloc/internal/prof_externs.h index 789e3811..7d962522 100644 --- a/include/jemalloc/internal/prof_externs.h +++ b/include/jemalloc/internal/prof_externs.h @@ -5,6 +5,7 @@ #include "jemalloc/internal/base.h" #include "jemalloc/internal/mutex.h" #include "jemalloc/internal/prof_hook.h" +#include "jemalloc/internal/thread_event_registry.h" extern bool opt_prof; extern bool opt_prof_active; @@ -104,9 +105,65 @@ void prof_prefork1(tsdn_t *tsdn); void prof_postfork_parent(tsdn_t *tsdn); void prof_postfork_child(tsdn_t *tsdn); -/* Only accessed by thread event. */ uint64_t prof_sample_new_event_wait(tsd_t *tsd); -uint64_t prof_sample_postponed_event_wait(tsd_t *tsd); -void prof_sample_event_handler(tsd_t *tsd, uint64_t elapsed); +uint64_t tsd_prof_sample_event_wait_get(tsd_t *tsd); + +/* + * The lookahead functionality facilitates events to be able to lookahead, i.e. + * without touching the event counters, to determine whether an event would be + * triggered. The event counters are not advanced until the end of the + * allocation / deallocation calls, so the lookahead can be useful if some + * preparation work for some event must be done early in the allocation / + * deallocation calls. + * + * Currently only the profiling sampling event needs the lookahead + * functionality, so we don't yet define general purpose lookahead functions. 
+ * + * Surplus is a terminology referring to the amount of bytes beyond what's + * needed for triggering an event, which can be a useful quantity to have in + * general when lookahead is being called. + * + * This function returns true if allocation of usize would go above the next + * trigger for prof event, and false otherwise. + * If function returns true surplus will contain number of bytes beyond that + * trigger. + */ + +JEMALLOC_ALWAYS_INLINE bool +te_prof_sample_event_lookahead_surplus(tsd_t *tsd, size_t usize, + size_t *surplus) { + if (surplus != NULL) { + /* + * This is a dead store: the surplus will be overwritten before + * any read. The initialization suppresses compiler warnings. + * Meanwhile, using SIZE_MAX to initialize is good for + * debugging purpose, because a valid surplus value is strictly + * less than usize, which is at most SIZE_MAX. + */ + *surplus = SIZE_MAX; + } + if (unlikely(!tsd_nominal(tsd) || tsd_reentrancy_level_get(tsd) > 0)) { + return false; + } + /* The subtraction is intentionally susceptible to underflow. 
*/ + uint64_t accumbytes = tsd_thread_allocated_get(tsd) + usize - + tsd_thread_allocated_last_event_get(tsd); + uint64_t sample_wait = tsd_prof_sample_event_wait_get(tsd); + if (accumbytes < sample_wait) { + return false; + } + assert(accumbytes - sample_wait < (uint64_t)usize); + if (surplus != NULL) { + *surplus = (size_t)(accumbytes - sample_wait); + } + return true; +} + +JEMALLOC_ALWAYS_INLINE bool +te_prof_sample_event_lookahead(tsd_t *tsd, size_t usize) { + return te_prof_sample_event_lookahead_surplus(tsd, usize, NULL); +} + +extern te_base_cb_t prof_sample_te_handler; #endif /* JEMALLOC_INTERNAL_PROF_EXTERNS_H */ diff --git a/include/jemalloc/internal/prof_threshold.h b/include/jemalloc/internal/prof_threshold.h index dc9c8f2b..93e9478e 100644 --- a/include/jemalloc/internal/prof_threshold.h +++ b/include/jemalloc/internal/prof_threshold.h @@ -3,9 +3,6 @@ #include "jemalloc/internal/tsd_types.h" -/* The activity-triggered hooks. */ -uint64_t prof_threshold_new_event_wait(tsd_t *tsd); -uint64_t prof_threshold_postponed_event_wait(tsd_t *tsd); -void prof_threshold_event_handler(tsd_t *tsd, uint64_t elapsed); +extern te_base_cb_t prof_threshold_te_handler; #endif /* JEMALLOC_INTERNAL_THRESHOLD_EVENT_H */ diff --git a/include/jemalloc/internal/stats.h b/include/jemalloc/internal/stats.h index 310178ea..a5f1be32 100644 --- a/include/jemalloc/internal/stats.h +++ b/include/jemalloc/internal/stats.h @@ -3,6 +3,7 @@ #include "jemalloc/internal/jemalloc_preamble.h" #include "jemalloc/internal/jemalloc_internal_types.h" +#include "jemalloc/internal/thread_event_registry.h" #include "jemalloc/internal/tsd_types.h" /* OPTION(opt, var_name, default, set_value_to) */ @@ -43,9 +44,7 @@ extern char opt_stats_interval_opts[stats_print_tot_num_options+1]; #define STATS_INTERVAL_ACCUM_BATCH_MAX (4 << 20) /* Only accessed by thread event. 
*/ -uint64_t stats_interval_new_event_wait(tsd_t *tsd); -uint64_t stats_interval_postponed_event_wait(tsd_t *tsd); -void stats_interval_event_handler(tsd_t *tsd, uint64_t elapsed); +extern te_base_cb_t stats_interval_te_handler; /* Implements je_malloc_stats_print. */ void stats_print(write_cb_t *write_cb, void *cbopaque, const char *opts); diff --git a/include/jemalloc/internal/tcache_externs.h b/include/jemalloc/internal/tcache_externs.h index 732adacb..024314fe 100644 --- a/include/jemalloc/internal/tcache_externs.h +++ b/include/jemalloc/internal/tcache_externs.h @@ -6,6 +6,7 @@ #include "jemalloc/internal/cache_bin.h" #include "jemalloc/internal/sz.h" #include "jemalloc/internal/tcache_types.h" +#include "jemalloc/internal/thread_event_registry.h" extern bool opt_tcache; extern size_t opt_tcache_max; @@ -89,4 +90,6 @@ uint64_t tcache_gc_dalloc_new_event_wait(tsd_t *tsd); uint64_t tcache_gc_dalloc_postponed_event_wait(tsd_t *tsd); void tcache_gc_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed); +extern te_base_cb_t tcache_gc_te_handler; + #endif /* JEMALLOC_INTERNAL_TCACHE_EXTERNS_H */ diff --git a/include/jemalloc/internal/thread_event.h b/include/jemalloc/internal/thread_event.h index ad46ffe7..e9631cbd 100644 --- a/include/jemalloc/internal/thread_event.h +++ b/include/jemalloc/internal/thread_event.h @@ -49,29 +49,12 @@ void te_event_trigger(tsd_t *tsd, te_ctx_t *ctx); void te_recompute_fast_threshold(tsd_t *tsd); void tsd_te_init(tsd_t *tsd); -/* - * List of all events, in the following format: - * E(event, (condition), is_alloc_event) - */ -#define ITERATE_OVER_ALL_EVENTS \ - E(tcache_gc, (opt_tcache_gc_incr_bytes > 0), true) \ - E(prof_sample, (config_prof && opt_prof), true) \ - E(prof_threshold, config_stats, true) \ - E(stats_interval, (opt_stats_interval >= 0), true) \ - E(tcache_gc_dalloc, (opt_tcache_gc_incr_bytes > 0), false) \ - E(peak_alloc, config_stats, true) \ - E(peak_dalloc, config_stats, false) - -#define E(event, condition_unused, 
is_alloc_event_unused) \ - C(event##_event_wait) - /* List of all thread event counters. */ -#define ITERATE_OVER_ALL_COUNTERS \ - C(thread_allocated) \ - C(thread_allocated_last_event) \ - ITERATE_OVER_ALL_EVENTS \ - C(prof_sample_last_event) \ - C(stats_interval_last_event) +#define ITERATE_OVER_ALL_COUNTERS \ + C(thread_allocated) \ + C(thread_allocated_last_event) \ + C(prof_sample_last_event) \ + C(stats_interval_last_event) /* Getters directly wrap TSD getters. */ #define C(counter) \ @@ -99,12 +82,6 @@ counter##_set(tsd_t *tsd, uint64_t v) { \ ITERATE_OVER_ALL_COUNTERS #undef C -/* - * For generating _event_wait getter / setter functions for each individual - * event. - */ -#undef E - /* * The malloc and free fastpath getters -- use the unsafe getters since tsd may * be non-nominal, in which case the fast_threshold will be set to 0. This @@ -221,57 +198,6 @@ te_ctx_get(tsd_t *tsd, te_ctx_t *ctx, bool is_alloc) { } } -/* - * The lookahead functionality facilitates events to be able to lookahead, i.e. - * without touching the event counters, to determine whether an event would be - * triggered. The event counters are not advanced until the end of the - * allocation / deallocation calls, so the lookahead can be useful if some - * preparation work for some event must be done early in the allocation / - * deallocation calls. - * - * Currently only the profiling sampling event needs the lookahead - * functionality, so we don't yet define general purpose lookahead functions. - * - * Surplus is a terminology referring to the amount of bytes beyond what's - * needed for triggering an event, which can be a useful quantity to have in - * general when lookahead is being called. - */ - -JEMALLOC_ALWAYS_INLINE bool -te_prof_sample_event_lookahead_surplus(tsd_t *tsd, size_t usize, - size_t *surplus) { - if (surplus != NULL) { - /* - * This is a dead store: the surplus will be overwritten before - * any read. The initialization suppresses compiler warnings. 
- * Meanwhile, using SIZE_MAX to initialize is good for - * debugging purpose, because a valid surplus value is strictly - * less than usize, which is at most SIZE_MAX. - */ - *surplus = SIZE_MAX; - } - if (unlikely(!tsd_nominal(tsd) || tsd_reentrancy_level_get(tsd) > 0)) { - return false; - } - /* The subtraction is intentionally susceptible to underflow. */ - uint64_t accumbytes = tsd_thread_allocated_get(tsd) + usize - - tsd_thread_allocated_last_event_get(tsd); - uint64_t sample_wait = tsd_prof_sample_event_wait_get(tsd); - if (accumbytes < sample_wait) { - return false; - } - assert(accumbytes - sample_wait < (uint64_t)usize); - if (surplus != NULL) { - *surplus = (size_t)(accumbytes - sample_wait); - } - return true; -} - -JEMALLOC_ALWAYS_INLINE bool -te_prof_sample_event_lookahead(tsd_t *tsd, size_t usize) { - return te_prof_sample_event_lookahead_surplus(tsd, usize, NULL); -} - JEMALLOC_ALWAYS_INLINE void te_event_advance(tsd_t *tsd, size_t usize, bool is_alloc) { te_assert_invariants(tsd); diff --git a/include/jemalloc/internal/thread_event_registry.h b/include/jemalloc/internal/thread_event_registry.h new file mode 100644 index 00000000..aee7a4f2 --- /dev/null +++ b/include/jemalloc/internal/thread_event_registry.h @@ -0,0 +1,58 @@ +#ifndef JEMALLOC_INTERNAL_THREAD_EVENT_REGISTRY_H +#define JEMALLOC_INTERNAL_THREAD_EVENT_REGISTRY_H + +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/tsd.h" + +/* "te" is short for "thread_event" */ +enum te_alloc_e { +#ifdef JEMALLOC_PROF + te_alloc_prof_sample, +#endif + te_alloc_stats_interval, +#ifdef JEMALLOC_STATS + te_alloc_prof_threshold, +#endif + te_alloc_tcache_gc, +#ifdef JEMALLOC_STATS + te_alloc_peak, + te_alloc_last = te_alloc_peak, +#else + te_alloc_last = te_alloc_tcache_gc, +#endif + te_alloc_count = te_alloc_last + 1 +}; +typedef enum te_alloc_e te_alloc_t; + +enum te_dalloc_e { + te_dalloc_tcache_gc, +#ifdef JEMALLOC_STATS + te_dalloc_peak, + te_dalloc_last = te_dalloc_peak, 
+#else + te_dalloc_last = te_dalloc_tcache_gc, +#endif + te_dalloc_count = te_dalloc_last + 1 +}; +typedef enum te_dalloc_e te_dalloc_t; + +/* These will live in tsd */ +typedef struct te_data_s te_data_t; +struct te_data_s { + uint64_t alloc_wait[te_alloc_count]; + uint64_t dalloc_wait[te_dalloc_count]; +}; +#define TE_DATA_INITIALIZER { {0}, {0} } + +typedef struct te_base_cb_s te_base_cb_t; +struct te_base_cb_s { + bool (*enabled)(void); + uint64_t (*new_event_wait)(tsd_t *tsd); + uint64_t (*postponed_event_wait)(tsd_t *tsd); + void (*event_handler)(tsd_t *tsd); +}; + +extern te_base_cb_t *te_alloc_handlers[te_alloc_count]; +extern te_base_cb_t *te_dalloc_handlers[te_dalloc_count]; + +#endif /* JEMALLOC_INTERNAL_THREAD_EVENT_REGISTRY_H */ diff --git a/include/jemalloc/internal/tsd_internals.h b/include/jemalloc/internal/tsd_internals.h index 0ed33234..69b60519 100644 --- a/include/jemalloc/internal/tsd_internals.h +++ b/include/jemalloc/internal/tsd_internals.h @@ -15,6 +15,7 @@ #include "jemalloc/internal/rtree_tsd.h" #include "jemalloc/internal/tcache_structs.h" #include "jemalloc/internal/tcache_types.h" +#include "jemalloc/internal/thread_event_registry.h" #include "jemalloc/internal/tsd_types.h" #include "jemalloc/internal/util.h" #include "jemalloc/internal/witness.h" @@ -68,15 +69,9 @@ typedef ql_elm(tsd_t) tsd_link_t; O(thread_allocated_next_event, uint64_t, uint64_t) \ O(thread_deallocated_last_event, uint64_t, uint64_t) \ O(thread_deallocated_next_event, uint64_t, uint64_t) \ - O(tcache_gc_event_wait, uint64_t, uint64_t) \ - O(tcache_gc_dalloc_event_wait, uint64_t, uint64_t) \ - O(prof_sample_event_wait, uint64_t, uint64_t) \ + O(te_data, te_data_t, te_data_t) \ O(prof_sample_last_event, uint64_t, uint64_t) \ - O(prof_threshold_event_wait, uint64_t, uint64_t) \ - O(stats_interval_event_wait, uint64_t, uint64_t) \ - O(stats_interval_last_event, uint64_t, uint64_t) \ - O(peak_alloc_event_wait, uint64_t, uint64_t) \ - O(peak_dalloc_event_wait, uint64_t, 
uint64_t) \ + O(stats_interval_last_event, uint64_t, uint64_t) \ O(prof_tdata, prof_tdata_t *, prof_tdata_t *) \ O(prng_state, uint64_t, uint64_t) \ O(san_extents_until_guard_small, uint64_t, uint64_t) \ @@ -102,15 +97,9 @@ typedef ql_elm(tsd_t) tsd_link_t; /* thread_allocated_next_event */ 0, \ /* thread_deallocated_last_event */ 0, \ /* thread_deallocated_next_event */ 0, \ - /* tcache_gc_event_wait */ 0, \ - /* tcache_gc_dalloc_event_wait */ 0, \ - /* prof_sample_event_wait */ 0, \ + /* te_data */ TE_DATA_INITIALIZER, \ /* prof_sample_last_event */ 0, \ - /* prof_threshold_event_wait */ 0, \ - /* stats_interval_event_wait */ 0, \ /* stats_interval_last_event */ 0, \ - /* peak_alloc_event_wait */ 0, \ - /* peak_dalloc_event_wait */ 0, \ /* prof_tdata */ NULL, \ /* prng_state */ 0, \ /* san_extents_until_guard_small */ 0, \ diff --git a/src/peak_event.c b/src/peak_event.c index 4093fbcc..430bfdea 100644 --- a/src/peak_event.c +++ b/src/peak_event.c @@ -5,14 +5,7 @@ #include "jemalloc/internal/activity_callback.h" #include "jemalloc/internal/peak.h" - -/* - * Update every 64K by default. We're not exposing this as a configuration - * option for now; we don't want to bind ourselves too tightly to any particular - * performance requirements for small values, or guarantee that we'll even be - * able to provide fine-grained accuracy. - */ -#define PEAK_EVENT_WAIT (64 * 1024) +#include "jemalloc/internal/thread_event_registry.h" /* Update the peak with current tsd state. 
*/ void @@ -49,34 +42,31 @@ peak_event_max(tsd_t *tsd) { return peak_max(peak); } -uint64_t -peak_alloc_new_event_wait(tsd_t *tsd) { +static uint64_t +peak_event_new_event_wait(tsd_t *tsd) { return PEAK_EVENT_WAIT; } -uint64_t -peak_alloc_postponed_event_wait(tsd_t *tsd) { +static uint64_t +peak_event_postponed_event_wait(tsd_t *tsd) { return TE_MIN_START_WAIT; } -void -peak_alloc_event_handler(tsd_t *tsd, uint64_t elapsed) { +static void +peak_event_handler(tsd_t *tsd) { peak_event_update(tsd); peak_event_activity_callback(tsd); } -uint64_t -peak_dalloc_new_event_wait(tsd_t *tsd) { - return PEAK_EVENT_WAIT; +static bool +peak_event_enabled(void) { + return config_stats; } -uint64_t -peak_dalloc_postponed_event_wait(tsd_t *tsd) { - return TE_MIN_START_WAIT; -} - -void -peak_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed) { - peak_event_update(tsd); - peak_event_activity_callback(tsd); -} +/* Handles alloc and dalloc */ +te_base_cb_t peak_te_handler = { + .enabled = &peak_event_enabled, + .new_event_wait = &peak_event_new_event_wait, + .postponed_event_wait = &peak_event_postponed_event_wait, + .event_handler = &peak_event_handler, +}; diff --git a/src/prof.c b/src/prof.c index 8fdc6f71..94eddb6d 100644 --- a/src/prof.c +++ b/src/prof.c @@ -12,6 +12,7 @@ #include "jemalloc/internal/prof_sys.h" #include "jemalloc/internal/prof_hook.h" #include "jemalloc/internal/thread_event.h" +#include "jemalloc/internal/thread_event_registry.h" /* * This file implements the profiling "APIs" needed by other parts of jemalloc, @@ -289,8 +290,40 @@ prof_sample_new_event_wait(tsd_t *tsd) { #endif } +void +prof_sample_event_handler(tsd_t *tsd) { + cassert(config_prof); + if (prof_interval == 0 || !prof_active_get_unlocked()) { + return; + } + uint64_t last_event = thread_allocated_last_event_get(tsd); + uint64_t last_sample_event = tsd_prof_sample_last_event_get(tsd); + tsd_prof_sample_last_event_set(tsd, last_event); + uint64_t elapsed = last_event - last_sample_event; + 
assert(elapsed > 0 && elapsed != TE_INVALID_ELAPSED); + if (counter_accum(tsd_tsdn(tsd), &prof_idump_accumulated, elapsed)) { + prof_idump(tsd_tsdn(tsd)); + } +} + +static bool +prof_sample_enabled(void) { + return config_prof && opt_prof; +} + uint64_t -prof_sample_postponed_event_wait(tsd_t *tsd) { +tsd_prof_sample_event_wait_get(tsd_t *tsd) { +#ifdef JEMALLOC_PROF + return tsd_te_datap_get_unsafe(tsd)->alloc_wait[te_alloc_prof_sample]; +#else + not_reached(); + return TE_MAX_START_WAIT; +#endif +} + +te_base_cb_t prof_sample_te_handler = { + .enabled = &prof_sample_enabled, + .new_event_wait = &prof_sample_new_event_wait, /* * The postponed wait time for prof sample event is computed as if we * want a new wait time (i.e. as if the event were triggered). If we @@ -298,21 +331,10 @@ prof_sample_postponed_event_wait(tsd_t *tsd) { * handling the other events, then we can have sampling bias, if e.g. * the allocation immediately following a reentrancy always comes from * the same stack trace. 
- */ - return prof_sample_new_event_wait(tsd); -} - -void -prof_sample_event_handler(tsd_t *tsd, uint64_t elapsed) { - cassert(config_prof); - assert(elapsed > 0 && elapsed != TE_INVALID_ELAPSED); - if (prof_interval == 0 || !prof_active_get_unlocked()) { - return; - } - if (counter_accum(tsd_tsdn(tsd), &prof_idump_accumulated, elapsed)) { - prof_idump(tsd_tsdn(tsd)); - } -} + */ + .postponed_event_wait = &prof_sample_new_event_wait, + .event_handler = &prof_sample_event_handler, +}; static void prof_fdump(void) { diff --git a/src/prof_threshold.c b/src/prof_threshold.c index 28a525fc..516b0bf6 100644 --- a/src/prof_threshold.c +++ b/src/prof_threshold.c @@ -52,6 +52,18 @@ prof_threshold_postponed_event_wait(tsd_t *tsd) { } void -prof_threshold_event_handler(tsd_t *tsd, uint64_t elapsed) { +prof_threshold_event_handler(tsd_t *tsd) { prof_threshold_update(tsd); } + +static bool +prof_threshold_enabled(void) { + return config_stats; +} + +te_base_cb_t prof_threshold_te_handler = { + .enabled = &prof_threshold_enabled, + .new_event_wait = &prof_threshold_new_event_wait, + .postponed_event_wait = &prof_threshold_postponed_event_wait, + .event_handler = &prof_threshold_event_handler, +}; diff --git a/src/stats.c b/src/stats.c index 8496e457..efc73223 100644 --- a/src/stats.c +++ b/src/stats.c @@ -65,7 +65,7 @@ char opt_stats_interval_opts[stats_print_tot_num_options+1] = ""; static counter_accum_t stats_interval_accumulated; /* Per thread batch accum size for stats_interval. 
*/ -static uint64_t stats_interval_accum_batch; +uint64_t stats_interval_accum_batch; /******************************************************************************/ @@ -2128,7 +2128,12 @@ stats_interval_postponed_event_wait(tsd_t *tsd) { } void -stats_interval_event_handler(tsd_t *tsd, uint64_t elapsed) { +stats_interval_event_handler(tsd_t *tsd) { + uint64_t last_event = thread_allocated_last_event_get(tsd); + uint64_t last_sample_event = tsd_stats_interval_last_event_get(tsd); + tsd_stats_interval_last_event_set(tsd, last_event); + uint64_t elapsed = last_event - last_sample_event; + assert(elapsed > 0 && elapsed != TE_INVALID_ELAPSED); if (counter_accum(tsd_tsdn(tsd), &stats_interval_accumulated, elapsed)) { @@ -2136,6 +2141,19 @@ stats_interval_event_handler(tsd_t *tsd, uint64_t elapsed) { } } +static bool +stats_interval_enabled(void) { + return opt_stats_interval >= 0; +} + +te_base_cb_t stats_interval_te_handler = { + .enabled = &stats_interval_enabled, + .new_event_wait = &stats_interval_new_event_wait, + .postponed_event_wait = &stats_interval_postponed_event_wait, + .event_handler = &stats_interval_event_handler, +}; + + bool stats_boot(void) { uint64_t stats_interval; diff --git a/src/tcache.c b/src/tcache.c index 270d38ac..36af7d97 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -511,7 +511,7 @@ tcache_try_gc_bin(tsd_t *tsd, tcache_slow_t *tcache_slow, tcache_t *tcache, } static void -tcache_event(tsd_t *tsd) { +tcache_gc_event(tsd_t *tsd) { tcache_t *tcache = tcache_get(tsd); if (tcache == NULL) { return; @@ -581,18 +581,6 @@ tcache_event(tsd_t *tsd) { tcache_slow->next_gc_bin_large = szind_large; } -void -tcache_gc_event_handler(tsd_t *tsd, uint64_t elapsed) { - assert(elapsed == TE_INVALID_ELAPSED); - tcache_event(tsd); -} - -void -tcache_gc_dalloc_event_handler(tsd_t *tsd, uint64_t elapsed) { - assert(elapsed == TE_INVALID_ELAPSED); - tcache_event(tsd); -} - void * tcache_alloc_small_hard(tsdn_t *tsdn, arena_t *arena, tcache_t *tcache, cache_bin_t 
*cache_bin, szind_t binind, @@ -1912,3 +1900,16 @@ tcache_postfork_child(tsdn_t *tsdn) { void tcache_assert_initialized(tcache_t *tcache) { assert(!cache_bin_still_zero_initialized(&tcache->bins[0])); } + +static bool +tcache_gc_enabled(void) { + return (opt_tcache_gc_incr_bytes > 0); +} + +/* Handles alloc and dalloc the same way */ +te_base_cb_t tcache_gc_te_handler = { + .enabled = &tcache_gc_enabled, + .new_event_wait = &tcache_gc_new_event_wait, + .postponed_event_wait = &tcache_gc_postponed_event_wait, + .event_handler = &tcache_gc_event, +}; diff --git a/src/thread_event.c b/src/thread_event.c index a8276cd7..0b1adcc1 100644 --- a/src/thread_event.c +++ b/src/thread_event.c @@ -2,108 +2,46 @@ #include "jemalloc/internal/jemalloc_internal_includes.h" #include "jemalloc/internal/thread_event.h" - -/* - * Signatures for event specific functions. These functions should be defined - * by the modules owning each event. The signatures here verify that the - * definitions follow the right format. - * - * The first two are functions computing new / postponed event wait time. New - * event wait time is the time till the next event if an event is currently - * being triggered; postponed event wait time is the time till the next event - * if an event should be triggered but needs to be postponed, e.g. when the TSD - * is not nominal or during reentrancy. - * - * The third is the event handler function, which is called whenever an event - * is triggered. The parameter is the elapsed time since the last time an - * event of the same type was triggered. - */ -#define E(event, condition_unused, is_alloc_event_unused) \ -uint64_t event##_new_event_wait(tsd_t *tsd); \ -uint64_t event##_postponed_event_wait(tsd_t *tsd); \ -void event##_event_handler(tsd_t *tsd, uint64_t elapsed); - -ITERATE_OVER_ALL_EVENTS -#undef E - -/* Signatures for internal functions fetching elapsed time. 
*/ -#define E(event, condition_unused, is_alloc_event_unused) \ -static uint64_t event##_fetch_elapsed(tsd_t *tsd); - -ITERATE_OVER_ALL_EVENTS -#undef E - -static uint64_t -tcache_gc_fetch_elapsed(tsd_t *tsd) { - return TE_INVALID_ELAPSED; -} - -static uint64_t -tcache_gc_dalloc_fetch_elapsed(tsd_t *tsd) { - return TE_INVALID_ELAPSED; -} - -static uint64_t -prof_sample_fetch_elapsed(tsd_t *tsd) { - uint64_t last_event = thread_allocated_last_event_get(tsd); - uint64_t last_sample_event = prof_sample_last_event_get(tsd); - prof_sample_last_event_set(tsd, last_event); - return last_event - last_sample_event; -} - -static uint64_t -stats_interval_fetch_elapsed(tsd_t *tsd) { - uint64_t last_event = thread_allocated_last_event_get(tsd); - uint64_t last_stats_event = stats_interval_last_event_get(tsd); - stats_interval_last_event_set(tsd, last_event); - return last_event - last_stats_event; -} - -static uint64_t -peak_alloc_fetch_elapsed(tsd_t *tsd) { - return TE_INVALID_ELAPSED; -} - -static uint64_t -peak_dalloc_fetch_elapsed(tsd_t *tsd) { - return TE_INVALID_ELAPSED; -} - -static uint64_t -prof_threshold_fetch_elapsed(tsd_t *tsd) { - return TE_INVALID_ELAPSED; -} - -/* Per event facilities done. */ +#include "jemalloc/internal/thread_event_registry.h" +#include "jemalloc/internal/peak_event.h" static bool te_ctx_has_active_events(te_ctx_t *ctx) { assert(config_debug); -#define E(event, condition, alloc_event) \ - if (condition && alloc_event == ctx->is_alloc) { \ - return true; \ + if (ctx->is_alloc) { + for (int i = 0; i < te_alloc_count; ++i) { + if (te_alloc_handlers[i]->enabled()) { + return true; + } + } + } else { + for (int i = 0; i < te_dalloc_count; ++i) { + if (te_dalloc_handlers[i]->enabled()) { + return true; + } + } } - ITERATE_OVER_ALL_EVENTS -#undef E return false; } static uint64_t te_next_event_compute(tsd_t *tsd, bool is_alloc) { + te_base_cb_t **handlers = is_alloc ? te_alloc_handlers : te_dalloc_handlers; + uint64_t *waits = is_alloc ? 
tsd_te_datap_get_unsafe(tsd)->alloc_wait : tsd_te_datap_get_unsafe(tsd)->dalloc_wait; + int count = is_alloc ? te_alloc_count : te_dalloc_count; + uint64_t wait = TE_MAX_START_WAIT; -#define E(event, condition, alloc_event) \ - if (is_alloc == alloc_event && condition) { \ - uint64_t event_wait = \ - event##_event_wait_get(tsd); \ - assert(event_wait <= TE_MAX_START_WAIT); \ - if (event_wait > 0U && event_wait < wait) { \ - wait = event_wait; \ - } \ + + for (int i = 0; i < count; i++) { + if (handlers[i]->enabled()) { + uint64_t ev_wait = waits[i]; + assert(ev_wait <= TE_MAX_START_WAIT); + if (ev_wait > 0U && ev_wait < wait) { + wait = ev_wait; + } + } } - ITERATE_OVER_ALL_EVENTS -#undef E - assert(wait <= TE_MAX_START_WAIT); return wait; } @@ -238,18 +176,132 @@ te_adjust_thresholds_helper(tsd_t *tsd, te_ctx_t *ctx, te_ctx_next_event_set(tsd, ctx, next_event); } -static uint64_t -te_clip_event_wait(uint64_t event_wait) { - assert(event_wait > 0U); - if (TE_MIN_START_WAIT > 1U && - unlikely(event_wait < TE_MIN_START_WAIT)) { - event_wait = TE_MIN_START_WAIT; +static void +te_init_waits(tsd_t *tsd, uint64_t *wait, bool is_alloc) { + te_base_cb_t **handlers = is_alloc ? te_alloc_handlers : te_dalloc_handlers; + uint64_t *waits = is_alloc ? tsd_te_datap_get_unsafe(tsd)->alloc_wait : tsd_te_datap_get_unsafe(tsd)->dalloc_wait; + int count = is_alloc ? 
te_alloc_count : te_dalloc_count; + for (int i = 0; i < count; i++) { + if (handlers[i]->enabled()) { + uint64_t ev_wait = handlers[i]->new_event_wait(tsd); + assert(ev_wait > 0); + waits[i] = ev_wait; + if (ev_wait < *wait) { + *wait = ev_wait; + } + } } - if (TE_MAX_START_WAIT < UINT64_MAX && - unlikely(event_wait > TE_MAX_START_WAIT)) { - event_wait = TE_MAX_START_WAIT; +} + +static inline bool +te_update_wait(tsd_t *tsd, uint64_t accumbytes, bool allow, + uint64_t *ev_wait, uint64_t *wait, te_base_cb_t *handler, + uint64_t new_wait) { + bool ret = false; + if (*ev_wait > accumbytes) { + *ev_wait -= accumbytes; + } else if (!allow) { + *ev_wait = handler->postponed_event_wait(tsd); + } else { + ret = true; + *ev_wait = new_wait == 0 ? + handler->new_event_wait(tsd) : + new_wait; + } + + assert(*ev_wait > 0); + if (*ev_wait < *wait) { + *wait = *ev_wait; + } + return ret; +} + +extern uint64_t stats_interval_accum_batch; +/* Return number of handlers enqueued into to_trigger array */ +static inline size_t +te_update_alloc_events(tsd_t *tsd, te_base_cb_t **to_trigger, + uint64_t accumbytes, bool allow, uint64_t *wait) { + /* + * We do not loop and invoke the functions via interface because + * of the perf cost. This path is relatively hot, so we sacrifice + * elegance for perf. 
+ */ + size_t nto_trigger = 0; + uint64_t *waits = tsd_te_datap_get_unsafe(tsd)->alloc_wait; + if (opt_tcache_gc_incr_bytes > 0) { + assert(te_alloc_handlers[te_alloc_tcache_gc]->enabled()); + if (te_update_wait(tsd, accumbytes, allow, + &waits[te_alloc_tcache_gc], wait, + te_alloc_handlers[te_alloc_tcache_gc], + opt_tcache_gc_incr_bytes)) { + to_trigger[nto_trigger++] = + te_alloc_handlers[te_alloc_tcache_gc]; + } } - return event_wait; +#ifdef JEMALLOC_PROF + if (opt_prof) { + assert(te_alloc_handlers[te_alloc_prof_sample]->enabled()); + if(te_update_wait(tsd, accumbytes, allow, + &waits[te_alloc_prof_sample], wait, + te_alloc_handlers[te_alloc_prof_sample], 0)) { + to_trigger[nto_trigger++] = + te_alloc_handlers[te_alloc_prof_sample]; + } + } +#endif + if (opt_stats_interval >= 0) { + if (te_update_wait(tsd, accumbytes, allow, + &waits[te_alloc_stats_interval], + wait, + te_alloc_handlers[te_alloc_stats_interval], + stats_interval_accum_batch)) { + assert(te_alloc_handlers[te_alloc_stats_interval]->enabled()); + to_trigger[nto_trigger++] = + te_alloc_handlers[te_alloc_stats_interval]; + } + } + +#ifdef JEMALLOC_STATS + assert(te_alloc_handlers[te_alloc_peak]->enabled()); + if(te_update_wait(tsd, accumbytes, allow, &waits[te_alloc_peak], wait, + te_alloc_handlers[te_alloc_peak], PEAK_EVENT_WAIT)) { + to_trigger[nto_trigger++] = te_alloc_handlers[te_alloc_peak]; + } + + assert(te_alloc_handlers[te_alloc_prof_threshold]->enabled()); + if(te_update_wait(tsd, accumbytes, allow, + &waits[te_alloc_prof_threshold], wait, + te_alloc_handlers[te_alloc_prof_threshold], + 1 << opt_experimental_lg_prof_threshold)) { + to_trigger[nto_trigger++] = te_alloc_handlers[te_alloc_prof_threshold]; + } +#endif + return nto_trigger; +} + +static inline size_t +te_update_dalloc_events(tsd_t *tsd, te_base_cb_t **to_trigger, uint64_t accumbytes, + bool allow, uint64_t *wait) { + size_t nto_trigger = 0; + uint64_t *waits = tsd_te_datap_get_unsafe(tsd)->dalloc_wait; + if 
(opt_tcache_gc_incr_bytes > 0) { + assert(te_dalloc_handlers[te_dalloc_tcache_gc]->enabled()); + if (te_update_wait(tsd, accumbytes, allow, + &waits[te_dalloc_tcache_gc], wait, + te_dalloc_handlers[te_dalloc_tcache_gc], + opt_tcache_gc_incr_bytes)) { + to_trigger[nto_trigger++] = + te_dalloc_handlers[te_dalloc_tcache_gc]; + } + } +#ifdef JEMALLOC_STATS + assert(te_dalloc_handlers[te_dalloc_peak]->enabled()); + if(te_update_wait(tsd, accumbytes, allow, &waits[te_dalloc_peak], wait, + te_dalloc_handlers[te_dalloc_peak], PEAK_EVENT_WAIT)) { + to_trigger[nto_trigger++] = te_dalloc_handlers[te_dalloc_peak]; + } +#endif + return nto_trigger; } void @@ -263,47 +315,32 @@ te_event_trigger(tsd_t *tsd, te_ctx_t *ctx) { bool allow_event_trigger = tsd_nominal(tsd) && tsd_reentrancy_level_get(tsd) == 0; - bool is_alloc = ctx->is_alloc; uint64_t wait = TE_MAX_START_WAIT; -#define E(event, condition, alloc_event) \ - bool is_##event##_triggered = false; \ - if (is_alloc == alloc_event && condition) { \ - uint64_t event_wait = event##_event_wait_get(tsd); \ - assert(event_wait <= TE_MAX_START_WAIT); \ - if (event_wait > accumbytes) { \ - event_wait -= accumbytes; \ - } else if (!allow_event_trigger) { \ - event_wait = event##_postponed_event_wait(tsd); \ - } else { \ - is_##event##_triggered = true; \ - event_wait = event##_new_event_wait(tsd); \ - } \ - event_wait = te_clip_event_wait(event_wait); \ - event##_event_wait_set(tsd, event_wait); \ - if (event_wait < wait) { \ - wait = event_wait; \ - } \ + assert((int)te_alloc_count >= (int) te_dalloc_count); + te_base_cb_t *to_trigger[te_alloc_count]; + size_t nto_trigger; + if (ctx->is_alloc) { + nto_trigger = te_update_alloc_events(tsd, to_trigger, + accumbytes, + allow_event_trigger, + &wait); + } else { + nto_trigger = te_update_dalloc_events(tsd, to_trigger, + accumbytes, + allow_event_trigger, + &wait); } - ITERATE_OVER_ALL_EVENTS -#undef E - - assert(wait <= TE_MAX_START_WAIT); + assert(wait <= TE_MAX_START_WAIT); 
te_adjust_thresholds_helper(tsd, ctx, wait); te_assert_invariants(tsd); -#define E(event, condition, alloc_event) \ - if (is_alloc == alloc_event && condition && \ - is_##event##_triggered) { \ - assert(allow_event_trigger); \ - uint64_t elapsed = event##_fetch_elapsed(tsd); \ - event##_event_handler(tsd, elapsed); \ + for (size_t i = 0; i < nto_trigger; i++) { + assert(allow_event_trigger); + to_trigger[i]->event_handler(tsd); } - ITERATE_OVER_ALL_EVENTS -#undef E - te_assert_invariants(tsd); } @@ -323,18 +360,8 @@ te_init(tsd_t *tsd, bool is_alloc) { te_ctx_last_event_set(&ctx, te_ctx_current_bytes_get(&ctx)); uint64_t wait = TE_MAX_START_WAIT; -#define E(event, condition, alloc_event) \ - if (is_alloc == alloc_event && condition) { \ - uint64_t event_wait = event##_new_event_wait(tsd); \ - event_wait = te_clip_event_wait(event_wait); \ - event##_event_wait_set(tsd, event_wait); \ - if (event_wait < wait) { \ - wait = event_wait; \ - } \ - } + te_init_waits(tsd, &wait, is_alloc); - ITERATE_OVER_ALL_EVENTS -#undef E te_adjust_thresholds_helper(tsd, &ctx, wait); } diff --git a/src/thread_event_registry.c b/src/thread_event_registry.c new file mode 100644 index 00000000..7543cfda --- /dev/null +++ b/src/thread_event_registry.c @@ -0,0 +1,37 @@ +#include "jemalloc/internal/jemalloc_preamble.h" +#include "jemalloc/internal/jemalloc_internal_includes.h" + +#include "jemalloc/internal/thread_event.h" +#include "jemalloc/internal/thread_event_registry.h" +#include "jemalloc/internal/thread_event_registry.h" +#include "jemalloc/internal/tcache_externs.h" +#include "jemalloc/internal/peak_event.h" +#include "jemalloc/internal/prof_externs.h" +#include "jemalloc/internal/prof_threshold.h" +#include "jemalloc/internal/stats.h" + + +/* Table of all the thread events. + * Events share an interface, but internally they will know their + * data layout in tsd. 
+ */ +te_base_cb_t *te_alloc_handlers[te_alloc_count] = { +#ifdef JEMALLOC_PROF + &prof_sample_te_handler, +#endif + &stats_interval_te_handler, +#ifdef JEMALLOC_STATS + &prof_threshold_te_handler, +#endif + &tcache_gc_te_handler, +#ifdef JEMALLOC_STATS + &peak_te_handler, +#endif +}; + +te_base_cb_t *te_dalloc_handlers[te_dalloc_count] = { + &tcache_gc_te_handler, +#ifdef JEMALLOC_STATS + &peak_te_handler, +#endif +}; diff --git a/test/unit/thread_event.c b/test/unit/thread_event.c index e0b88a92..8b4fb1d6 100644 --- a/test/unit/thread_event.c +++ b/test/unit/thread_event.c @@ -8,12 +8,11 @@ TEST_BEGIN(test_next_event_fast) { te_ctx_last_event_set(&ctx, 0); te_ctx_current_bytes_set(&ctx, TE_NEXT_EVENT_FAST_MAX - 8U); te_ctx_next_event_set(tsd, &ctx, TE_NEXT_EVENT_FAST_MAX); -#define E(event, condition, is_alloc) \ - if (is_alloc && condition) { \ - event##_event_wait_set(tsd, TE_NEXT_EVENT_FAST_MAX); \ + + uint64_t *waits = tsd_te_datap_get_unsafe(tsd)->alloc_wait; + for (size_t i = 0; i < te_alloc_count; i++) { + waits[i] = TE_NEXT_EVENT_FAST_MAX; } - ITERATE_OVER_ALL_EVENTS -#undef E /* Test next_event_fast rolling back to 0. */ void *p = malloc(16U);