diff --git a/src/tcache.c b/src/tcache.c index 2114ff95..8bec5d6c 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -312,20 +312,9 @@ tcache_bin_flush_edatas_lookup(tsd_t *tsd, cache_bin_ptr_array_t *arr, } } -JEMALLOC_ALWAYS_INLINE bool -tcache_bin_flush_match(edata_t *edata, unsigned cur_arena_ind, - unsigned cur_binshard, bool small) { - if (small) { - return edata_arena_ind_get(edata) == cur_arena_ind - && edata_binshard_get(edata) == cur_binshard; - } else { - return edata_arena_ind_get(edata) == cur_arena_ind; - } -} - JEMALLOC_ALWAYS_INLINE void -tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, - szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush, bool small) { +tcache_bin_flush_impl_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, + szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush) { tcache_slow_t *tcache_slow = tcache->tcache_slow; /* * A couple lookup calls take tsdn; declare it once for convenience @@ -333,11 +322,7 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, */ tsdn_t *tsdn = tsd_tsdn(tsd); - if (small) { - assert(binind < SC_NBINS); - } else { - assert(binind < tcache_nbins_get(tcache_slow)); - } + assert(binind < SC_NBINS); arena_t *tcache_arena = tcache_slow->arena; assert(tcache_arena != NULL); @@ -368,34 +353,19 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, unsigned cur_arena_ind = edata_arena_ind_get(edata); arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false); + unsigned cur_binshard = edata_binshard_get(edata); + bin_t *cur_bin = arena_get_bin(cur_arena, binind, + cur_binshard); + assert(cur_binshard < bin_infos[binind].n_shards); /* - * These assignments are always overwritten when small is true, - * and their values are always ignored when small is false, but - * to avoid the technical UB when we pass them as parameters, we - * need to intialize them. + * If you're looking at profiles, you might think this + * is a good place to prefetch the bin stats, which are + * often a cache miss. This turns out not to be + * helpful on the workloads we've looked at, with moving + * the bin stats next to the lock seeming to do better. */ - unsigned cur_binshard = 0; - bin_t *cur_bin = NULL; - if (small) { - cur_binshard = edata_binshard_get(edata); - cur_bin = arena_get_bin(cur_arena, binind, - cur_binshard); - assert(cur_binshard < bin_infos[binind].n_shards); - /* - * If you're looking at profiles, you might think this - * is a good place to prefetch the bin stats, which are - * often a cache miss. This turns out not to be - * helpful on the workloads we've looked at, with moving - * the bin stats next to the lock seeming to do better. - */ - } - if (small) { - malloc_mutex_lock(tsdn, &cur_bin->lock); - } - if (!small && !arena_is_auto(cur_arena)) { - malloc_mutex_lock(tsdn, &cur_arena->large_mtx); - } + malloc_mutex_lock(tsdn, &cur_bin->lock); /* * If we acquired the right lock and have some stats to flush, @@ -404,53 +374,23 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, if (config_stats && tcache_arena == cur_arena && !merged_stats) { merged_stats = true; - if (small) { - cur_bin->stats.nflushes++; - cur_bin->stats.nrequests += - cache_bin->tstats.nrequests; - cache_bin->tstats.nrequests = 0; - } else { - arena_stats_large_flush_nrequests_add(tsdn, - &tcache_arena->stats, binind, - cache_bin->tstats.nrequests); - cache_bin->tstats.nrequests = 0; - } - } - - /* - * Large allocations need special prep done. Afterwards, we can - * drop the large lock. - */ - if (!small) { - for (unsigned i = 0; i < nflush; i++) { - void *ptr = ptrs->ptr[i]; - edata = item_edata[i].edata; - assert(ptr != NULL && edata != NULL); - - if (tcache_bin_flush_match(edata, cur_arena_ind, - cur_binshard, small)) { - large_dalloc_prep_locked(tsdn, - edata); - } - } - } - if (!small && !arena_is_auto(cur_arena)) { - malloc_mutex_unlock(tsdn, &cur_arena->large_mtx); + cur_bin->stats.nflushes++; + cur_bin->stats.nrequests += + cache_bin->tstats.nrequests; + cache_bin->tstats.nrequests = 0; } /* Deallocate whatever we can. */ unsigned ndeferred = 0; /* Init only to avoid used-uninitialized warning. */ arena_dalloc_bin_locked_info_t dalloc_bin_info = {0}; - if (small) { - arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind); - } + arena_dalloc_bin_locked_begin(&dalloc_bin_info, binind); for (unsigned i = 0; i < nflush; i++) { void *ptr = ptrs->ptr[i]; edata = item_edata[i].edata; assert(ptr != NULL && edata != NULL); - if (!tcache_bin_flush_match(edata, cur_arena_ind, - cur_binshard, small)) { + if (edata_arena_ind_get(edata) != cur_arena_ind + || edata_binshard_get(edata) != cur_binshard) { /* * The object was allocated either via a * different arena, or a different bin in this @@ -462,34 +402,23 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, ndeferred++; continue; } - if (small) { - if (arena_dalloc_bin_locked_step(tsdn, - cur_arena, cur_bin, &dalloc_bin_info, - binind, edata, ptr)) { - dalloc_slabs[dalloc_count] = edata; - dalloc_count++; - } - } else { - if (large_dalloc_safety_checks(edata, ptr, - binind)) { - /* See the comment in isfree. */ - continue; - } - large_dalloc_finish(tsdn, edata); + if (arena_dalloc_bin_locked_step(tsdn, + cur_arena, cur_bin, &dalloc_bin_info, + binind, edata, ptr)) { + dalloc_slabs[dalloc_count] = edata; + dalloc_count++; } } - if (small) { - arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin, - &dalloc_bin_info); - malloc_mutex_unlock(tsdn, &cur_bin->lock); - } + arena_dalloc_bin_locked_finish(tsdn, cur_arena, cur_bin, + &dalloc_bin_info); + malloc_mutex_unlock(tsdn, &cur_bin->lock); + arena_decay_ticks(tsdn, cur_arena, nflush - ndeferred); nflush = ndeferred; } /* Handle all deferred slab dalloc. */ - assert(small || dalloc_count == 0); for (unsigned i = 0; i < dalloc_count; i++) { edata_t *slab = dalloc_slabs[i]; arena_slab_dalloc(tsdn, arena_get_from_edata(slab), slab); @@ -497,7 +426,6 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, } if (config_stats && !merged_stats) { - if (small) { /* * The flush loop didn't happen to flush to this * thread's arena, so the stats didn't get merged. @@ -510,14 +438,132 @@ tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, bin->stats.nrequests += cache_bin->tstats.nrequests; cache_bin->tstats.nrequests = 0; malloc_mutex_unlock(tsdn, &bin->lock); - } else { + } +} + +JEMALLOC_ALWAYS_INLINE void +tcache_bin_flush_impl_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, + szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush) { + tcache_slow_t *tcache_slow = tcache->tcache_slow; + /* + * A couple lookup calls take tsdn; declare it once for convenience + * instead of calling tsd_tsdn(tsd) all the time. + */ + tsdn_t *tsdn = tsd_tsdn(tsd); + + assert(binind < tcache_nbins_get(tcache_slow)); + arena_t *tcache_arena = tcache_slow->arena; + assert(tcache_arena != NULL); + + /* + * Variable length array must have > 0 length; the last element is never + * touched (it's just included to satisfy the no-zero-length rule). + */ + VARIABLE_ARRAY(emap_batch_lookup_result_t, item_edata, nflush + 1); + tcache_bin_flush_edatas_lookup(tsd, ptrs, binind, nflush, item_edata); + + /* + * We're about to grab a bunch of locks. If one of them happens to be + * the one guarding the arena-level stats counters we flush our + * thread-local ones to, we do so under one critical section. + */ + bool merged_stats = false; + while (nflush > 0) { + /* Lock the arena, or bin, associated with the first object. */ + edata_t *edata = item_edata[0].edata; + unsigned cur_arena_ind = edata_arena_ind_get(edata); + arena_t *cur_arena = arena_get(tsdn, cur_arena_ind, false); + + if (!arena_is_auto(cur_arena)) { + malloc_mutex_lock(tsdn, &cur_arena->large_mtx); + } + + /* + * If we acquired the right lock and have some stats to flush, + * flush them. + */ + if (config_stats && tcache_arena == cur_arena + && !merged_stats) { + merged_stats = true; arena_stats_large_flush_nrequests_add(tsdn, &tcache_arena->stats, binind, cache_bin->tstats.nrequests); cache_bin->tstats.nrequests = 0; } + + /* + * Large allocations need special prep done. Afterwards, we can + * drop the large lock. + */ + for (unsigned i = 0; i < nflush; i++) { + void *ptr = ptrs->ptr[i]; + edata = item_edata[i].edata; + assert(ptr != NULL && edata != NULL); + + if (edata_arena_ind_get(edata) == cur_arena_ind) { + large_dalloc_prep_locked(tsdn, + edata); + } + } + if (!arena_is_auto(cur_arena)) { + malloc_mutex_unlock(tsdn, &cur_arena->large_mtx); + } + + /* Deallocate whatever we can. */ + unsigned ndeferred = 0; + for (unsigned i = 0; i < nflush; i++) { + void *ptr = ptrs->ptr[i]; + edata = item_edata[i].edata; + assert(ptr != NULL && edata != NULL); + if (edata_arena_ind_get(edata) != cur_arena_ind) { + /* + * The object was allocated either via a + * different arena, or a different bin in this + * arena. Either way, stash the object so that + * it can be handled in a future pass. + */ + ptrs->ptr[ndeferred] = ptr; + item_edata[ndeferred].edata = edata; + ndeferred++; + continue; + } + if (large_dalloc_safety_checks(edata, ptr, binind)) { + /* See the comment in isfree. */ + continue; + } + large_dalloc_finish(tsdn, edata); + } + arena_decay_ticks(tsdn, cur_arena, nflush - ndeferred); + nflush = ndeferred; } + if (config_stats && !merged_stats) { + arena_stats_large_flush_nrequests_add(tsdn, + &tcache_arena->stats, binind, + cache_bin->tstats.nrequests); + cache_bin->tstats.nrequests = 0; + } +} + +JEMALLOC_ALWAYS_INLINE void +tcache_bin_flush_impl(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, + szind_t binind, cache_bin_ptr_array_t *ptrs, unsigned nflush, bool small) { + /* + * The small/large flush logic is very similar; you might conclude that + * it's a good opportunity to share code. We've tried this, and by and + * large found this to obscure more than it helps; there are so many + * fiddly bits around things like stats handling, precisely when and + * which mutexes are acquired, etc., that almost all code ends up being + * gated behind 'if (small) { ... } else { ... }'. Even though the + * '...' is morally equivalent, the code itself needs slight tweaks. + */ + if (small) { + tcache_bin_flush_impl_small(tsd, tcache, cache_bin, binind, + ptrs, nflush); + } else { + tcache_bin_flush_impl_large(tsd, tcache, cache_bin, binind, + ptrs, nflush); + } } JEMALLOC_ALWAYS_INLINE void @@ -556,13 +602,15 @@ tcache_bin_flush_bottom(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, void tcache_bin_flush_small(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, szind_t binind, unsigned rem) { - tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem, true); + tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem, + /* small */ true); } void tcache_bin_flush_large(tsd_t *tsd, tcache_t *tcache, cache_bin_t *cache_bin, szind_t binind, unsigned rem) { - tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem, false); + tcache_bin_flush_bottom(tsd, tcache, cache_bin, binind, rem, + /* small */ false); } /*