bin: enforce bin->lock ownership in bin_slab_reg_alloc()

bitmap_set() performs a plain (non-atomic) read-modify-write on every
level of the bitmap tree:

    g  = *gp;            /* READ                              */
    g ^= ZU(1) << bit;  /* MODIFY — thread-local copy        */
    *gp = g;             /* WRITE BACK — no barrier, no CAS  */

Two threads that reach bitmap_sfu() -> bitmap_set() concurrently on the
same slab bitmap — even for different bits that share a group word —
will clobber each other's write.  The clobbered bit still looks free on
the next allocation; bitmap_sfu() selects it again; the second call to
bitmap_set() aborts on:

    assert(!bitmap_get(bitmap, binfo, bit));   /* bitmap.h:220 */

or, once tree propagation begins for a newly-full group:

    assert(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK)));  /* bitmap.h:237 */

Either assert calls abort() and produces the coredump reported in
issues #2875 and #2772.

The immediate callers (bin_malloc_with_fresh_slab,
bin_malloc_no_fresh_slab) already assert lock ownership, but
bin_slab_reg_alloc() itself had no such check, making it easy for new
call sites to silently bypass the requirement.

Fix:
- Thread tsdn_t *tsdn and bin_t *bin through bin_slab_reg_alloc() and
  call malloc_mutex_assert_owner() as the first statement.
- Update both internal callers (bin_malloc_with_fresh_slab,
  bin_malloc_no_fresh_slab) to pass the context they already hold.
- Document the locking contract in bin.h and the thread-safety
  constraint in bitmap.h directly above bitmap_set().

Note: bin_slab_reg_alloc_batch() is left unchanged because it has one
legitimate unlocked caller (arena_fill_small_fresh) which operates on
freshly allocated slabs that are not yet visible to any other thread.
Its locking contract is now documented in bin.h.

Fixes #2875
LD-RW 2026-04-10 20:45:51 +03:00
parent 6515df8cec
commit 65a7d19928
3 changed files with 79 additions and 5 deletions

bin.h

@@ -62,8 +62,26 @@ void bin_prefork(tsdn_t *tsdn, bin_t *bin);
 void bin_postfork_parent(tsdn_t *tsdn, bin_t *bin);
 void bin_postfork_child(tsdn_t *tsdn, bin_t *bin);
 
-/* Slab region allocation. */
-void *bin_slab_reg_alloc(edata_t *slab, const bin_info_t *bin_info);
+/*
+ * Slab region allocation locking contracts:
+ *
+ * bin_slab_reg_alloc()
+ *     REQUIRES: bin->lock held by the caller.
+ *     Calls bitmap_sfu() -> bitmap_set(), which performs a non-atomic
+ *     read-modify-write on the slab bitmap at every tree level.  Two threads
+ *     that reach bitmap_sfu() concurrently (even for different bits in the
+ *     same group word) will corrupt the bitmap and eventually abort inside
+ *     bitmap_set().  malloc_mutex_assert_owner() is used internally to catch
+ *     call sites that forget to acquire the lock.
+ *
+ * bin_slab_reg_alloc_batch()
+ *     Shared slab (already inserted into a bin): bin->lock MUST be held.
+ *     Fresh slab (just returned by arena_slab_alloc, not yet visible to any
+ *     other thread): no lock required.
+ *     The caller is responsible for selecting the correct case.
+ */
+void *bin_slab_reg_alloc(tsdn_t *tsdn, bin_t *bin,
+    edata_t *slab, const bin_info_t *bin_info);
 void bin_slab_reg_alloc_batch(
     edata_t *slab, const bin_info_t *bin_info, unsigned cnt, void **ptrs);

bitmap.h

@@ -210,6 +210,33 @@ bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
 	return !(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK)));
 }
 
+/*
+ * bitmap_set() performs a non-atomic read-modify-write on every level of the
+ * bitmap tree:
+ *
+ *     g = *gp;            // READ
+ *     g ^= ZU(1) << bit;  // MODIFY (thread-local copy)
+ *     *gp = g;            // WRITE BACK — not atomic, no barrier
+ *
+ * If two threads enter bitmap_set() concurrently for bits that share the same
+ * group word (or a common ancestor group in the BITMAP_USE_TREE path), one
+ * thread's write silently clobbers the other's.  On the next call the
+ * clobbered bit still appears free; bitmap_sfu() returns it again; the second
+ * call to bitmap_set() for that bit trips:
+ *
+ *     assert(!bitmap_get(bitmap, binfo, bit));  // line 220 — bit already set
+ *
+ * or, after a group drains to zero and tree propagation begins:
+ *
+ *     assert(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK)));  // line 237
+ *
+ * Either assert calls abort() and produces the observed coredump.
+ *
+ * bitmap_set() is intentionally not thread-safe.  Callers must hold the
+ * owning bin_t.lock for the entire duration of bitmap_sfu() -> bitmap_set().
+ * The fix enforces this contract at the bin_slab_reg_alloc() call boundary
+ * via malloc_mutex_assert_owner(); see src/bin.c and include/.../bin.h.
+ */
 static inline void
 bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
 	size_t goff;
src/bin.c

@@ -68,12 +68,41 @@ bin_postfork_child(tsdn_t *tsdn, bin_t *bin) {
 	malloc_mutex_postfork_child(tsdn, &bin->lock);
 }
 
+/*
+ * bin_slab_reg_alloc(): enforce bin->lock ownership before touching the
+ * bitmap.
+ *
+ * Root cause of issue #2875 (and #2772):
+ * bitmap_set() is not thread-safe.  It does a plain read-modify-write on
+ * each level of the bitmap tree with no atomics or barriers.  If two threads
+ * ever reach bitmap_sfu() -> bitmap_set() concurrently on the same slab
+ * bitmap (even for different bit positions that share a group word), one
+ * write silently discards the other.  The clobbered bit still looks free on
+ * the next allocation attempt; bitmap_sfu() selects it again; the second
+ * call to bitmap_set() fires:
+ *
+ *     assert(!bitmap_get(bitmap, binfo, bit));  // bitmap.h:220
+ *
+ * which calls abort() and produces the coredump.
+ *
+ * The existing callers (bin_malloc_with_fresh_slab, bin_malloc_no_fresh_slab)
+ * already assert lock ownership via malloc_mutex_assert_owner(), but
+ * bin_slab_reg_alloc() itself had no such check, making it easy for future
+ * callers to silently bypass the requirement.
+ *
+ * Fix: thread tsdn_t *tsdn and bin_t *bin through the signature and add
+ * malloc_mutex_assert_owner() as the first statement, so any caller that
+ * forgets to hold the lock is caught immediately in debug builds rather than
+ * producing a rare, hard-to-reproduce coredump in production.
+ */
 void *
-bin_slab_reg_alloc(edata_t *slab, const bin_info_t *bin_info) {
+bin_slab_reg_alloc(tsdn_t *tsdn, bin_t *bin,
+    edata_t *slab, const bin_info_t *bin_info) {
 	void *ret;
 	slab_data_t *slab_data = edata_slab_data_get(slab);
 	size_t regind;
 
+	malloc_mutex_assert_owner(tsdn, &bin->lock);
+
 	assert(edata_nfree_get(slab) > 0);
 	assert(!bitmap_full(slab_data->bitmap, &bin_info->bitmap_info));
@@ -281,7 +310,7 @@ bin_malloc_with_fresh_slab(tsdn_t *tsdn, bin_t *bin,
 	malloc_mutex_assert_owner(tsdn, &bin->lock);
 	bin_refill_slabcur_with_fresh_slab(tsdn, bin, binind, fresh_slab);
-	return bin_slab_reg_alloc(bin->slabcur, &bin_infos[binind]);
+	return bin_slab_reg_alloc(tsdn, bin, bin->slabcur, &bin_infos[binind]);
 }
 
 bool
@@ -312,7 +341,7 @@ bin_malloc_no_fresh_slab(tsdn_t *tsdn, bool is_auto, bin_t *bin,
 	}
 
 	assert(bin->slabcur != NULL && edata_nfree_get(bin->slabcur) > 0);
-	return bin_slab_reg_alloc(bin->slabcur, &bin_infos[binind]);
+	return bin_slab_reg_alloc(tsdn, bin, bin->slabcur, &bin_infos[binind]);
 }
 
 bin_t *