diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 84101c65..91369dde 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -37,13 +37,12 @@ TSD_DATA_SLOWER
 JEMALLOC_ALWAYS_INLINE t *						\
 tsd_##n##p_get(tsd_t *tsd) {						\
 	/*								\
-	 * Because the state might change asynchronously if it's	\
-	 * nominal, we need to make sure that we only read it once.	\
+	 * Read the state once, so that a transition between		\
+	 * nominal states does not confuse this assertion.		\
 	 */								\
 	uint8_t state = tsd_state_get(tsd);				\
 	assert(state == tsd_state_nominal ||				\
 	    state == tsd_state_nominal_slow ||				\
-	    state == tsd_state_nominal_recompute ||			\
 	    state == tsd_state_reincarnated ||				\
 	    state == tsd_state_minimal_initialized);			\
 	return tsd_##n##p_get_unsafe(tsd);				\
@@ -95,11 +94,6 @@ TSD_DATA_SLOWER
 
 JEMALLOC_ALWAYS_INLINE void
 tsd_assert_fast(tsd_t *tsd) {
-	/*
-	 * Note that our fastness assertion does *not* include global slowness
-	 * counters; it's not in general possible to ensure that they won't
-	 * change asynchronously from underneath us.
-	 */
	assert(!malloc_slow && tsd_tcache_enabled_get(tsd) &&
	    tsd_reentrancy_level_get(tsd) == 0);
 }
diff --git a/include/jemalloc/internal/tsd_internals.h b/include/jemalloc/internal/tsd_internals.h
index 062e2974..0e03c318 100644
--- a/include/jemalloc/internal/tsd_internals.h
+++ b/include/jemalloc/internal/tsd_internals.h
@@ -157,39 +157,29 @@ enum {
 	tsd_state_nominal = 0,
 	/* Initialized but on slow path. */
 	tsd_state_nominal_slow = 1,
-	/*
-	 * Some thread has changed global state in such a way that all nominal
-	 * threads need to recompute their fast / slow status the next time they
-	 * get a chance.
-	 *
-	 * Any thread can change another thread's status *to* recompute, but
-	 * threads are the only ones who can change their status *from*
-	 * recompute.
-	 */
-	tsd_state_nominal_recompute = 2,
 	/*
 	 * The above nominal states should be lower values.  We use
 	 * tsd_nominal_max to separate nominal states from threads in the
 	 * process of being born / dying.
 	 */
-	tsd_state_nominal_max = 2,
+	tsd_state_nominal_max = 1,
 	/*
 	 * A thread might free() during its death as its only allocator action;
 	 * in such scenarios, we need tsd, but set up in such a way that no
 	 * cleanup is necessary.
 	 */
-	tsd_state_minimal_initialized = 3,
+	tsd_state_minimal_initialized = 2,
 	/* States during which we know we're in thread death. */
-	tsd_state_purgatory = 4,
-	tsd_state_reincarnated = 5,
+	tsd_state_purgatory = 3,
+	tsd_state_reincarnated = 4,
 	/*
 	 * What it says on the tin; tsd that hasn't been initialized.  Note
 	 * that even when the tsd struct lives in TLS, we need to keep track
 	 * of stuff like whether or not our pthread destructors have been
 	 * scheduled, so this really truly is different than the nominal state.
 	 */
-	tsd_state_uninitialized = 6
+	tsd_state_uninitialized = 5
 };
 
 /*
@@ -203,12 +193,10 @@ enum {
 #  define tsd_state_t atomic_u8_t
 #  define tsd_atomic_load atomic_load_u8
 #  define tsd_atomic_store atomic_store_u8
-#  define tsd_atomic_exchange atomic_exchange_u8
 #else
 #  define tsd_state_t atomic_u32_t
 #  define tsd_atomic_load atomic_load_u32
 #  define tsd_atomic_store atomic_store_u32
-#  define tsd_atomic_exchange atomic_exchange_u32
 #endif
 
 /* The actual tsd. */
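The retained comment above is terse, so here is a minimal standalone sketch of the read-once idiom it refers to. All demo_* names are hypothetical, not jemalloc's; the point is only why the assertion snapshots the state into a local before testing it.

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

enum { demo_nominal = 0, demo_nominal_slow = 1 };

_Atomic uint8_t demo_state;

void
demo_assert_nominal(void) {
	/*
	 * Broken variant: two independent loads.  If the state moves from
	 * demo_nominal_slow to demo_nominal between the two loads, the first
	 * comparison sees demo_nominal_slow (!= demo_nominal) and the second
	 * sees demo_nominal (!= demo_nominal_slow), so the assertion fires
	 * even though the state was legal at every instant:
	 *
	 *	assert(atomic_load(&demo_state) == demo_nominal ||
	 *	    atomic_load(&demo_state) == demo_nominal_slow);
	 */

	/* Read-once variant: snapshot the state, then test the snapshot. */
	uint8_t state = atomic_load(&demo_state);
	assert(state == demo_nominal || state == demo_nominal_slow);
}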
diff --git a/src/thread_event.c b/src/thread_event.c
index 82776342..a8c5e2e1 100644
--- a/src/thread_event.c
+++ b/src/thread_event.c
@@ -101,53 +101,6 @@ te_assert_invariants_debug(tsd_t *tsd) {
 	te_assert_invariants_impl(tsd, &ctx);
 }
 
-/*
- * Synchronization around the fast threshold in tsd --
- * There are two threads to consider in the synchronization here:
- * - The owner of the tsd being updated by a slow path change
- * - The remote thread, doing that slow path change.
- *
- * As a design constraint, we want to ensure that a slow-path transition cannot
- * be ignored for arbitrarily long, and that if the remote thread causes a
- * slow-path transition and then communicates with the owner thread that it has
- * occurred, then the owner will go down the slow path on the next allocator
- * operation (that is, we don't want to just wait until the owner hits its slow
- * path reset condition on its own).
- *
- * Here's our strategy to do that:
- *
- * The remote thread will update the slow-path stores to TSD variables, issue a
- * SEQ_CST fence, and then update the TSD next_event_fast counter.  The owner
- * thread will update next_event_fast, issue a SEQ_CST fence, and then check
- * its TSD to see if it's on the slow path.
- *
- * This is fairly straightforward when 64-bit atomics are supported. Assume that
- * the remote fence is sandwiched between two owner fences in the reset pathway.
- * The case where there is no preceding or trailing owner fence (i.e. because
- * the owner thread is near the beginning or end of its life) can be analyzed
- * similarly. The owner store to next_event_fast preceding the earlier owner
- * fence will be earlier in coherence order than the remote store to it, so that
- * the owner thread will go down the slow path once the store becomes visible to
- * it, which is no later than the time of the second fence.
- *
- * The case where we don't support 64-bit atomics is trickier, since word
- * tearing is possible. We'll repeat the same analysis, and look at the two
- * owner fences sandwiching the remote fence. The next_event_fast stores done
- * alongside the earlier owner fence cannot overwrite any of the remote stores
- * (since they precede the earlier owner fence in sb, which precedes the remote
- * fence in sc, which precedes the remote stores in sb). After the second owner
- * fence there will be a re-check of the slow-path variables anyway, so the
- * "owner will notice that it's on the slow path eventually" guarantee is
- * satisfied. To make sure that the out-of-band-messaging constraint is as well,
- * note that either the message passing is sequenced before the second owner
- * fence (in which case the remote stores happen before the second set of owner
- * stores, so malloc sees a value of zero for next_event_fast and goes down the
- * slow path), or it is not (in which case the owner sees the tsd slow-path
- * writes on its previous update). This leaves open the possibility that the
- * remote thread will (at some arbitrary point in the future) zero out one half
- * of the owner thread's next_event_fast, but that's always safe (it just sends
- * it down the slow path earlier).
- */
 static void
 te_ctx_next_event_fast_update(te_ctx_t *ctx) {
 	uint64_t next_event = te_ctx_next_event_get(ctx);
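For readers losing the removed comment's context, here is a hedged sketch of the store/fence/check protocol that the deleted block describes, written with C11 atomics. All demo_* names are hypothetical and this is not jemalloc's actual code; it only makes the remote/owner ordering concrete.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

atomic_bool demo_slow;			/* stand-in for slow-path TSD data */
_Atomic uint64_t demo_next_event_fast;	/* 0 forces the slow path */

/* Remote thread: publish slowness first, then knock down the threshold. */
void
demo_remote_force_slow(void) {
	atomic_store_explicit(&demo_slow, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	atomic_store_explicit(&demo_next_event_fast, 0, memory_order_relaxed);
}

/* Owner thread: reset the threshold, then re-check for slowness. */
void
demo_owner_reset(uint64_t threshold) {
	atomic_store_explicit(&demo_next_event_fast, threshold,
	    memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load_explicit(&demo_slow, memory_order_relaxed)) {
		/* A remote transition raced with the reset; stay slow. */
		atomic_store_explicit(&demo_next_event_fast, 0,
		    memory_order_relaxed);
	}
}

Per the removed analysis, either the owner's re-check observes demo_slow, or the remote zeroing of demo_next_event_fast lands after the owner's reset in coherence order; in both cases the owner takes the slow path on its next event check.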
@@ -160,7 +113,6 @@ te_ctx_next_event_fast_update(te_ctx_t *ctx) {
 void
 te_recompute_fast_threshold(tsd_t *tsd) {
 	if (tsd_state_get(tsd) != tsd_state_nominal) {
-		/* Check first because this is also called on purgatory. */
 		te_next_event_fast_set_non_nominal(tsd);
 		return;
 	}
@@ -170,11 +122,6 @@ te_recompute_fast_threshold(tsd_t *tsd) {
 	te_ctx_next_event_fast_update(&ctx);
 	te_ctx_get(tsd, &ctx, false);
 	te_ctx_next_event_fast_update(&ctx);
-
-	atomic_fence(ATOMIC_SEQ_CST);
-	if (tsd_state_get(tsd) != tsd_state_nominal) {
-		te_next_event_fast_set_non_nominal(tsd);
-	}
 }
 
 static inline void
diff --git a/src/tsd.c b/src/tsd.c
index cb6edfdc..97c2065e 100644
--- a/src/tsd.c
+++ b/src/tsd.c
@@ -124,20 +124,17 @@ tsd_state_compute(tsd_t *tsd) {
 
 void
 tsd_slow_update(tsd_t *tsd) {
-	uint8_t old_state;
-	do {
-		uint8_t new_state = tsd_state_compute(tsd);
-		old_state = tsd_atomic_exchange(
-		    &tsd->state, new_state, ATOMIC_ACQUIRE);
-	} while (old_state == tsd_state_nominal_recompute);
+	assert(!tsd_booted_get() || tsd_get(false) == tsd
+	    || (tsd_get_allocates() && tsd_get(false) == NULL));
+	tsd_atomic_store(&tsd->state, tsd_state_compute(tsd), ATOMIC_RELAXED);
 	te_recompute_fast_threshold(tsd);
 }
 
 void
 tsd_state_set(tsd_t *tsd, uint8_t new_state) {
-	/* Only the tsd module can change the state *to* recompute. */
-	assert(new_state != tsd_state_nominal_recompute);
+	assert(!tsd_booted_get() || tsd_get(false) == tsd
+	    || (tsd_get_allocates() && tsd_get(false) == NULL));
 	uint8_t old_state = tsd_atomic_load(&tsd->state, ATOMIC_RELAXED);
 	if (old_state > tsd_state_nominal_max) {
 		/*
@@ -162,10 +159,10 @@ tsd_state_set(tsd_t *tsd, uint8_t new_state) {
 			    &tsd->state, new_state, ATOMIC_RELAXED);
 		} else {
 			/*
-			 * This is the tricky case.  We're transitioning from
-			 * one nominal state to another.  The caller can't know
-			 * about any races that are occurring at the same time,
-			 * so we always have to recompute no matter what.
+			 * We're transitioning from one nominal state to
+			 * another.  Recompute the state from the underlying
+			 * slow-path data rather than trusting the caller's
+			 * requested nominal state.
 			 */
 			tsd_slow_update(tsd);
 		}
@@ -234,13 +231,10 @@ tsd_fetch_slow(tsd_t *tsd, bool minimal) {
 
 	if (tsd_state_get(tsd) == tsd_state_nominal_slow) {
 		/*
-		 * On slow path but no work needed.  Note that we can't
-		 * necessarily *assert* that we're slow, because we might be
-		 * slow because of an asynchronous modification to global state,
-		 * which might be asynchronously modified *back*.
+		 * On slow path but no work needed.  Local slow-path state may
+		 * have changed since we computed the TSD state, so don't assert
+		 * on the underlying cause here.
		 */
-	} else if (tsd_state_get(tsd) == tsd_state_nominal_recompute) {
-		tsd_slow_update(tsd);
 	} else if (tsd_state_get(tsd) == tsd_state_uninitialized) {
 		if (!minimal) {
 			if (tsd_booted) {
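The tsd.c side of the change replaces the exchange loop with an owner-only relaxed store, plus an assertion that the caller really is the owning thread. A minimal sketch of that discipline (hypothetical demo_* names, pthreads assumed; not jemalloc's actual code):

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

enum { demo_nominal = 0, demo_nominal_slow = 1 };

typedef struct {
	pthread_t owner;	/* thread that owns this tsd */
	_Atomic uint8_t state;	/* demo_nominal or demo_nominal_slow */
	bool tcache_enabled;	/* underlying slow-path datum */
} demo_tsd_t;

/* Derive the state from the underlying data, not a caller's request. */
static uint8_t
demo_state_compute(const demo_tsd_t *tsd) {
	return tsd->tcache_enabled ? demo_nominal : demo_nominal_slow;
}

void
demo_slow_update(demo_tsd_t *tsd) {
	/* Owner-only now: with no remote writers, a relaxed store suffices. */
	assert(pthread_equal(pthread_self(), tsd->owner));
	atomic_store_explicit(&tsd->state, demo_state_compute(tsd),
	    memory_order_relaxed);
}

Recomputing from ground truth is also why the nominal-to-nominal branch in tsd_state_set calls tsd_slow_update instead of storing the caller's requested state directly.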