diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h
index 84101c65..91369dde 100644
--- a/include/jemalloc/internal/tsd.h
+++ b/include/jemalloc/internal/tsd.h
@@ -37,13 +37,12 @@ TSD_DATA_SLOWER
 JEMALLOC_ALWAYS_INLINE t *						\
 tsd_##n##p_get(tsd_t *tsd) {						\
 	/*								\
-	 * Because the state might change asynchronously if it's	\
-	 * nominal, we need to make sure that we only read it once.	\
+	 * Read the state once, so that a transition between		\
+	 * nominal states does not confuse this assertion.		\
 	 */								\
 	uint8_t state = tsd_state_get(tsd);				\
 	assert(state == tsd_state_nominal ||				\
 	    state == tsd_state_nominal_slow ||				\
-	    state == tsd_state_nominal_recompute ||			\
 	    state == tsd_state_reincarnated ||				\
 	    state == tsd_state_minimal_initialized);			\
 	return tsd_##n##p_get_unsafe(tsd);				\
@@ -95,11 +94,6 @@ TSD_DATA_SLOWER
 
 JEMALLOC_ALWAYS_INLINE void
 tsd_assert_fast(tsd_t *tsd) {
-	/*
-	 * Note that our fastness assertion does *not* include global slowness
-	 * counters; it's not in general possible to ensure that they won't
-	 * change asynchronously from underneath us.
-	 */
	assert(!malloc_slow && tsd_tcache_enabled_get(tsd) &&
	    tsd_reentrancy_level_get(tsd) == 0);
 }
diff --git a/include/jemalloc/internal/tsd_internals.h b/include/jemalloc/internal/tsd_internals.h
index 062e2974..0e03c318 100644
--- a/include/jemalloc/internal/tsd_internals.h
+++ b/include/jemalloc/internal/tsd_internals.h
@@ -157,39 +157,29 @@ enum {
 	tsd_state_nominal = 0,
 	/* Initialized but on slow path. */
 	tsd_state_nominal_slow = 1,
-	/*
-	 * Some thread has changed global state in such a way that all nominal
-	 * threads need to recompute their fast / slow status the next time they
-	 * get a chance.
-	 *
-	 * Any thread can change another thread's status *to* recompute, but
-	 * threads are the only ones who can change their status *from*
-	 * recompute.
-	 */
-	tsd_state_nominal_recompute = 2,
 	/*
 	 * The above nominal states should be lower values.  We use
 	 * tsd_nominal_max to separate nominal states from threads in the
 	 * process of being born / dying.
 	 */
-	tsd_state_nominal_max = 2,
+	tsd_state_nominal_max = 1,
 	/*
 	 * A thread might free() during its death as its only allocator action;
 	 * in such scenarios, we need tsd, but set up in such a way that no
 	 * cleanup is necessary.
 	 */
-	tsd_state_minimal_initialized = 3,
+	tsd_state_minimal_initialized = 2,
 	/* States during which we know we're in thread death. */
-	tsd_state_purgatory = 4,
-	tsd_state_reincarnated = 5,
+	tsd_state_purgatory = 3,
+	tsd_state_reincarnated = 4,
 	/*
 	 * What it says on the tin; tsd that hasn't been initialized.  Note
 	 * that even when the tsd struct lives in TLS, we need to keep track
 	 * of stuff like whether or not our pthread destructors have been
 	 * scheduled, so this really truly is different than the nominal state.
 	 */
-	tsd_state_uninitialized = 6
+	tsd_state_uninitialized = 5
 };
 
 /*
@@ -203,12 +193,10 @@ enum {
 #  define tsd_state_t atomic_u8_t
 #  define tsd_atomic_load atomic_load_u8
 #  define tsd_atomic_store atomic_store_u8
-#  define tsd_atomic_exchange atomic_exchange_u8
 #else
 #  define tsd_state_t atomic_u32_t
 #  define tsd_atomic_load atomic_load_u32
 #  define tsd_atomic_store atomic_store_u32
-#  define tsd_atomic_exchange atomic_exchange_u32
 #endif
 
 /* The actual tsd. */
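The retained comment above is terse, so here is a minimal standalone sketch of the read-once idiom it refers to. All demo_* names are hypothetical, not jemalloc's; the point is only why the assertion snapshots the state into a local before testing it.

#include <assert.h>
#include <stdatomic.h>
#include <stdint.h>

enum { demo_nominal = 0, demo_nominal_slow = 1 };

_Atomic uint8_t demo_state;

void
demo_assert_nominal(void) {
	/*
	 * Broken variant: two independent loads.  If the state moves from
	 * demo_nominal_slow to demo_nominal between the two loads, the first
	 * comparison sees demo_nominal_slow (!= demo_nominal) and the second
	 * sees demo_nominal (!= demo_nominal_slow), so the assertion fires
	 * even though the state was legal at every instant:
	 *
	 *	assert(atomic_load(&demo_state) == demo_nominal ||
	 *	    atomic_load(&demo_state) == demo_nominal_slow);
	 */

	/* Read-once variant: snapshot the state, then test the snapshot. */
	uint8_t state = atomic_load(&demo_state);
	assert(state == demo_nominal || state == demo_nominal_slow);
}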
diff --git a/src/thread_event.c b/src/thread_event.c
index 82776342..a8c5e2e1 100644
--- a/src/thread_event.c
+++ b/src/thread_event.c
@@ -101,53 +101,6 @@ te_assert_invariants_debug(tsd_t *tsd) {
 	te_assert_invariants_impl(tsd, &ctx);
 }
 
-/*
- * Synchronization around the fast threshold in tsd --
- * There are two threads to consider in the synchronization here:
- * - The owner of the tsd being updated by a slow path change
- * - The remote thread, doing that slow path change.
- *
- * As a design constraint, we want to ensure that a slow-path transition cannot
- * be ignored for arbitrarily long, and that if the remote thread causes a
- * slow-path transition and then communicates with the owner thread that it has
- * occurred, then the owner will go down the slow path on the next allocator
- * operation (that is, we don't want to just wait until the owner hits its slow
- * path reset condition on its own).
- *
- * Here's our strategy to do that:
- *
- * The remote thread will update the slow-path stores to TSD variables, issue a
- * SEQ_CST fence, and then update the TSD next_event_fast counter.  The owner
- * thread will update next_event_fast, issue a SEQ_CST fence, and then check
- * its TSD to see if it's on the slow path.
- *
- * This is fairly straightforward when 64-bit atomics are supported. Assume that
- * the remote fence is sandwiched between two owner fences in the reset pathway.
- * The case where there is no preceding or trailing owner fence (i.e. because
- * the owner thread is near the beginning or end of its life) can be analyzed
- * similarly. The owner store to next_event_fast preceding the earlier owner
- * fence will be earlier in coherence order than the remote store to it, so that
- * the owner thread will go down the slow path once the store becomes visible to
- * it, which is no later than the time of the second fence.
- *
- * The case where we don't support 64-bit atomics is trickier, since word
- * tearing is possible. We'll repeat the same analysis, and look at the two
- * owner fences sandwiching the remote fence. The next_event_fast stores done
- * alongside the earlier owner fence cannot overwrite any of the remote stores
- * (since they precede the earlier owner fence in sb, which precedes the remote
- * fence in sc, which precedes the remote stores in sb). After the second owner
- * fence there will be a re-check of the slow-path variables anyway, so the
- * "owner will notice that it's on the slow path eventually" guarantee is
- * satisfied. To make sure that the out-of-band-messaging constraint is as well,
- * note that either the message passing is sequenced before the second owner
- * fence (in which case the remote stores happen before the second set of owner
- * stores, so malloc sees a value of zero for next_event_fast and goes down the
- * slow path), or it is not (in which case the owner sees the tsd slow-path
- * writes on its previous update). This leaves open the possibility that the
- * remote thread will (at some arbitrary point in the future) zero out one half
- * of the owner thread's next_event_fast, but that's always safe (it just sends
- * it down the slow path earlier).
- */
 static void
 te_ctx_next_event_fast_update(te_ctx_t *ctx) {
 	uint64_t next_event = te_ctx_next_event_get(ctx);
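For readers losing the removed comment's context, here is a hedged sketch of the store/fence/check protocol that the deleted block describes, written with C11 atomics. All demo_* names are hypothetical and this is not jemalloc's actual code; it only makes the remote/owner ordering concrete.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

atomic_bool demo_slow;			/* stand-in for slow-path TSD data */
_Atomic uint64_t demo_next_event_fast;	/* 0 forces the slow path */

/* Remote thread: publish slowness first, then knock down the threshold. */
void
demo_remote_force_slow(void) {
	atomic_store_explicit(&demo_slow, true, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	atomic_store_explicit(&demo_next_event_fast, 0, memory_order_relaxed);
}

/* Owner thread: reset the threshold, then re-check for slowness. */
void
demo_owner_reset(uint64_t threshold) {
	atomic_store_explicit(&demo_next_event_fast, threshold,
	    memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	if (atomic_load_explicit(&demo_slow, memory_order_relaxed)) {
		/* A remote transition raced with the reset; stay slow. */
		atomic_store_explicit(&demo_next_event_fast, 0,
		    memory_order_relaxed);
	}
}

Per the removed analysis, either the owner's re-check observes demo_slow, or the remote zeroing of demo_next_event_fast lands after the owner's reset in coherence order; in both cases the owner takes the slow path on its next event check.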
@@ -160,7 +113,6 @@ te_ctx_next_event_fast_update(te_ctx_t *ctx) {
 void
 te_recompute_fast_threshold(tsd_t *tsd) {
 	if (tsd_state_get(tsd) != tsd_state_nominal) {
-		/* Check first because this is also called on purgatory. */
 		te_next_event_fast_set_non_nominal(tsd);
 		return;
 	}
@@ -170,11 +122,6 @@ te_recompute_fast_threshold(tsd_t *tsd) {
 	te_ctx_next_event_fast_update(&ctx);
 	te_ctx_get(tsd, &ctx, false);
 	te_ctx_next_event_fast_update(&ctx);
-
-	atomic_fence(ATOMIC_SEQ_CST);
-	if (tsd_state_get(tsd) != tsd_state_nominal) {
-		te_next_event_fast_set_non_nominal(tsd);
-	}
 }
 
 static inline void
diff --git a/src/tsd.c b/src/tsd.c
index cb6edfdc..97c2065e 100644
--- a/src/tsd.c
+++ b/src/tsd.c
@@ -124,20 +124,17 @@ tsd_state_compute(tsd_t *tsd) {
 
 void
 tsd_slow_update(tsd_t *tsd) {
-	uint8_t old_state;
-	do {
-		uint8_t new_state = tsd_state_compute(tsd);
-		old_state = tsd_atomic_exchange(
-		    &tsd->state, new_state, ATOMIC_ACQUIRE);
-	} while (old_state == tsd_state_nominal_recompute);
+	assert(!tsd_booted_get() || tsd_get(false) == tsd
+	    || (tsd_get_allocates() && tsd_get(false) == NULL));
+	tsd_atomic_store(&tsd->state, tsd_state_compute(tsd), ATOMIC_RELAXED);
 	te_recompute_fast_threshold(tsd);
 }
 
 void
 tsd_state_set(tsd_t *tsd, uint8_t new_state) {
-	/* Only the tsd module can change the state *to* recompute. */
-	assert(new_state != tsd_state_nominal_recompute);
+	assert(!tsd_booted_get() || tsd_get(false) == tsd
+	    || (tsd_get_allocates() && tsd_get(false) == NULL));
 	uint8_t old_state = tsd_atomic_load(&tsd->state, ATOMIC_RELAXED);
 	if (old_state > tsd_state_nominal_max) {
 		/*
@@ -162,10 +159,10 @@ tsd_state_set(tsd_t *tsd, uint8_t new_state) {
 			    &tsd->state, new_state, ATOMIC_RELAXED);
 		} else {
 			/*
-			 * This is the tricky case.  We're transitioning from
-			 * one nominal state to another.  The caller can't know
-			 * about any races that are occurring at the same time,
-			 * so we always have to recompute no matter what.
+			 * We're transitioning from one nominal state to
+			 * another.  Recompute the state from the underlying
+			 * slow-path data rather than trusting the caller's
+			 * requested nominal state.
 			 */
 			tsd_slow_update(tsd);
 		}
@@ -234,13 +231,10 @@ tsd_fetch_slow(tsd_t *tsd, bool minimal) {
 
 	if (tsd_state_get(tsd) == tsd_state_nominal_slow) {
 		/*
-		 * On slow path but no work needed.  Note that we can't
-		 * necessarily *assert* that we're slow, because we might be
-		 * slow because of an asynchronous modification to global state,
-		 * which might be asynchronously modified *back*.
+		 * On slow path but no work needed.  Local slow-path state may
+		 * have changed since we computed the TSD state, so don't assert
+		 * on the underlying cause here.
		 */
-	} else if (tsd_state_get(tsd) == tsd_state_nominal_recompute) {
-		tsd_slow_update(tsd);
 	} else if (tsd_state_get(tsd) == tsd_state_uninitialized) {
 		if (!minimal) {
 			if (tsd_booted) {
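The tsd.c side of the change replaces the exchange loop with an owner-only relaxed store, plus an assertion that the caller really is the owning thread. A minimal sketch of that discipline (hypothetical demo_* names, pthreads assumed; not jemalloc's actual code):

#include <assert.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

enum { demo_nominal = 0, demo_nominal_slow = 1 };

typedef struct {
	pthread_t owner;	/* thread that owns this tsd */
	_Atomic uint8_t state;	/* demo_nominal or demo_nominal_slow */
	bool tcache_enabled;	/* underlying slow-path datum */
} demo_tsd_t;

/* Derive the state from the underlying data, not a caller's request. */
static uint8_t
demo_state_compute(const demo_tsd_t *tsd) {
	return tsd->tcache_enabled ? demo_nominal : demo_nominal_slow;
}

void
demo_slow_update(demo_tsd_t *tsd) {
	/* Owner-only now: with no remote writers, a relaxed store suffices. */
	assert(pthread_equal(pthread_self(), tsd->owner));
	atomic_store_explicit(&tsd->state, demo_state_compute(tsd),
	    memory_order_relaxed);
}

Recomputing from ground truth is also why the nominal-to-nominal branch in tsd_state_set calls tsd_slow_update instead of storing the caller's requested state directly.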