diff --git a/include/jemalloc/internal/cache_bin.h b/include/jemalloc/internal/cache_bin.h index cc72af6b..0fb08421 100644 --- a/include/jemalloc/internal/cache_bin.h +++ b/include/jemalloc/internal/cache_bin.h @@ -35,67 +35,53 @@ struct cache_bin_stats_s { */ typedef struct cache_bin_info_s cache_bin_info_t; struct cache_bin_info_s { - /* The size of the bin stack, i.e. ncached_max * sizeof(ptr). */ - cache_bin_sz_t stack_size; + cache_bin_sz_t ncached_max; }; typedef struct cache_bin_s cache_bin_t; struct cache_bin_s { /* - * The cache bin stack is represented using 3 pointers: cur_ptr, - * low_water and full, optimized for the fast path efficiency. - * - * low addr ==> high addr - * |----|----|----|item1|item2|.....................|itemN| - * full cur empty - * (ncached == N; full + ncached_max == empty) - * - * Data directly stored: - * 1) cur_ptr points to the current item to be allocated, i.e. *cur_ptr. - * 2) full points to the top of the stack (i.e. ncached == ncached_max), - * which is compared against on free_fastpath to check "is_full". - * 3) low_water indicates a low water mark of ncached. - * Range of low_water is [cur, empty], i.e. values of [ncached, 0]. - * - * The empty position (ncached == 0) is derived via full + ncached_max - * and not accessed in the common case (guarded behind low_water). - * - * On 64-bit, 2 of the 3 pointers (full and low water) are compressed by - * omitting the high 32 bits. Overflow of the half pointers is avoided - * when allocating / initializing the stack space. As a result, - * cur_ptr.lowbits can be safely used for pointer comparisons. + * The stack grows down. Whenever the bin is nonempty, the head points + * to an array entry containing a valid allocation. When it is empty, + * the head points to one element past the owned array. */ - union { - void **ptr; - struct { - /* highbits never accessed directly. */ -#if (LG_SIZEOF_PTR == 3 && defined(JEMALLOC_BIG_ENDIAN)) - uint32_t __highbits; -#endif - uint32_t lowbits; -#if (LG_SIZEOF_PTR == 3 && !defined(JEMALLOC_BIG_ENDIAN)) - uint32_t __highbits; -#endif - }; - } cur_ptr; + void **stack_head; + + /* + * The low bits of the address of the first item in the stack that + * hasn't been used since the last GC, to track the low water mark (min + * # of cached items). + * + * Since the stack grows down, this is a higher address than + * low_bits_full. + */ + uint16_t low_bits_low_water; + + /* + * The low bits of the value that stack_head will take on when the array + * is full. (But remember that stack_head always points to a valid item + * when the array is nonempty -- this is in the array). + * + * Recall that since the stack grows down, this is the lowest address in + * the array. + */ + uint16_t low_bits_full; + + /* + * The low bits of the value that stack_head will take on when the array + * is empty. + * + * The stack grows down -- this is one past the highest address in the + * array. + */ + uint16_t low_bits_empty; + /* * cur_ptr and stats are both modified frequently. Let's keep them * close so that they have a higher chance of being on the same * cacheline, thus less write-backs. */ cache_bin_stats_t tstats; - /* - * Points to the first item that hasn't been used since last GC, to - * track the low water mark (min # of cached). - */ - uint32_t low_water_position; - /* - * Points to the position when the cache is full. - * - * To make use of adjacent cacheline prefetch, the items in the avail - * stack goes to higher address for newer allocations (i.e. cur_ptr++). - */ - uint32_t full_position; }; typedef struct cache_bin_array_descriptor_s cache_bin_array_descriptor_t; @@ -118,30 +104,51 @@ struct cache_bin_array_descriptor_s { /* Returns ncached_max: Upper limit on ncached. */ static inline cache_bin_sz_t cache_bin_info_ncached_max(cache_bin_info_t *info) { - return info->stack_size / sizeof(void *); + return info->ncached_max; } +/* + * Asserts that the pointer associated with earlier is <= the one associated + * with later. + */ +static inline void +cache_bin_assert_earlier(cache_bin_t *bin, uint16_t earlier, uint16_t later) { + if (earlier > later) { + assert(bin->low_bits_full > bin->low_bits_empty); + } +} + +/* + * Internal -- does difference calculations that handle wraparound correctly. + * Earlier must be associated with the position earlier in memory. + */ +static inline uint16_t +cache_bin_diff(cache_bin_t *bin, uint16_t earlier, uint16_t later) { + cache_bin_assert_earlier(bin, earlier, later); + return later - earlier; +} + + static inline cache_bin_sz_t cache_bin_ncached_get(cache_bin_t *bin, cache_bin_info_t *info) { - cache_bin_sz_t n = (cache_bin_sz_t)((info->stack_size + - bin->full_position - bin->cur_ptr.lowbits) / sizeof(void *)); + cache_bin_sz_t diff = cache_bin_diff(bin, + (uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty); + cache_bin_sz_t n = diff / sizeof(void *); + assert(n <= cache_bin_info_ncached_max(info)); - assert(n == 0 || *(bin->cur_ptr.ptr) != NULL); + assert(n == 0 || *(bin->stack_head) != NULL); return n; } static inline void ** cache_bin_empty_position_get(cache_bin_t *bin, cache_bin_info_t *info) { - void **ret = bin->cur_ptr.ptr + cache_bin_ncached_get(bin, info); - /* Low bits overflow disallowed when allocating the space. */ - assert((uint32_t)(uintptr_t)ret >= bin->cur_ptr.lowbits); + cache_bin_sz_t diff = cache_bin_diff(bin, + (uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty); + uintptr_t empty_bits = (uintptr_t)bin->stack_head + diff; + void **ret = (void **)empty_bits; - /* Can also be computed via (full_position + ncached_max) | highbits. */ - uintptr_t lowbits = bin->full_position + info->stack_size; - uintptr_t highbits = (uintptr_t)bin->cur_ptr.ptr & - ~(((uint64_t)1 << 32) - 1); - assert(ret == (void **)(lowbits | highbits)); + assert(ret >= bin->stack_head); return ret; } @@ -149,20 +156,29 @@ cache_bin_empty_position_get(cache_bin_t *bin, cache_bin_info_t *info) { static inline void cache_bin_assert_empty(cache_bin_t *bin, cache_bin_info_t *info) { assert(cache_bin_ncached_get(bin, info) == 0); - assert(cache_bin_empty_position_get(bin, info) == bin->cur_ptr.ptr); + assert(cache_bin_empty_position_get(bin, info) == bin->stack_head); } +/* + * Get low water, but without any of the correctness checking we do for the + * caller-usable version, if we are temporarily breaking invariants (like + * ncached >= low_water during flush). + */ +static inline cache_bin_sz_t +cache_bin_low_water_get_internal(cache_bin_t *bin, cache_bin_info_t *info) { + return cache_bin_diff(bin, bin->low_bits_low_water, + bin->low_bits_empty) / sizeof(void *); +} /* Returns the numeric value of low water in [0, ncached]. */ static inline cache_bin_sz_t cache_bin_low_water_get(cache_bin_t *bin, cache_bin_info_t *info) { - cache_bin_sz_t ncached_max = cache_bin_info_ncached_max(info); - cache_bin_sz_t low_water = ncached_max - - (cache_bin_sz_t)((bin->low_water_position - bin->full_position) / - sizeof(void *)); - assert(low_water <= ncached_max); + cache_bin_sz_t low_water = cache_bin_low_water_get_internal(bin, info); + assert(low_water <= cache_bin_info_ncached_max(info)); assert(low_water <= cache_bin_ncached_get(bin, info)); - assert(bin->low_water_position >= bin->cur_ptr.lowbits); + + cache_bin_assert_earlier(bin, (uint16_t)(uintptr_t)bin->stack_head, + bin->low_bits_low_water); return low_water; } @@ -173,20 +189,7 @@ cache_bin_low_water_get(cache_bin_t *bin, cache_bin_info_t *info) { */ static inline void cache_bin_low_water_set(cache_bin_t *bin) { - bin->low_water_position = bin->cur_ptr.lowbits; -} - -/* - * This is an internal implementation detail -- users should only affect ncached - * via single-item pushes or batch fills. - */ -static inline void -cache_bin_ncached_set(cache_bin_t *bin, cache_bin_info_t *info, - cache_bin_sz_t n) { - bin->cur_ptr.lowbits = bin->full_position + info->stack_size - - n * sizeof(void *); - assert(n <= cache_bin_info_ncached_max(info)); - assert(n == 0 || *bin->cur_ptr.ptr != NULL); + bin->low_bits_low_water = (uint16_t)(uintptr_t)bin->stack_head; } static inline void @@ -198,38 +201,35 @@ cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor, } JEMALLOC_ALWAYS_INLINE void * -cache_bin_alloc_easy_impl(cache_bin_t *bin, cache_bin_info_t *info, - bool *success, const bool adjust_low_water) { +cache_bin_alloc_easy_impl(cache_bin_t *bin, bool *success, + const bool adjust_low_water) { /* * This may read from the empty position; however the loaded value won't * be used. It's safe because the stack has one more slot reserved. */ - void *ret = *(bin->cur_ptr.ptr++); + void *ret = *bin->stack_head; + uint16_t low_bits = (uint16_t)(uintptr_t)bin->stack_head; + void **new_head = bin->stack_head + 1; /* - * Check for both bin->ncached == 0 and ncached < low_water in a single - * branch. When adjust_low_water is true, this also avoids accessing - * the cache_bin_info_t (which is on a separate cacheline / page) in - * the common case. + * Note that the low water mark is at most empty; if we pass this check, + * we know we're non-empty. */ - if (unlikely(bin->cur_ptr.lowbits > bin->low_water_position)) { + if (unlikely(low_bits == bin->low_bits_low_water)) { if (adjust_low_water) { - uint32_t empty_position = bin->full_position + - info->stack_size; - if (unlikely(bin->cur_ptr.lowbits > empty_position)) { - /* Over-allocated; revert. */ - bin->cur_ptr.ptr--; - assert(bin->cur_ptr.lowbits == empty_position); + if (unlikely(low_bits == bin->low_bits_empty)) { *success = false; return NULL; } - bin->low_water_position = bin->cur_ptr.lowbits; + /* Overflow should be impossible. */ + assert(bin->low_bits_low_water + < (uint16_t)(uintptr_t)new_head); + bin->low_bits_low_water = (uint16_t)(uintptr_t)new_head; } else { - bin->cur_ptr.ptr--; - assert(bin->cur_ptr.lowbits == bin->low_water_position); *success = false; return NULL; } } + bin->stack_head = new_head; /* * success (instead of ret) should be checked upon the return of this @@ -246,22 +246,27 @@ cache_bin_alloc_easy_impl(cache_bin_t *bin, cache_bin_info_t *info, JEMALLOC_ALWAYS_INLINE void * cache_bin_alloc_easy_reduced(cache_bin_t *bin, bool *success) { /* We don't look at info if we're not adjusting low-water. */ - return cache_bin_alloc_easy_impl(bin, NULL, success, false); + return cache_bin_alloc_easy_impl(bin, success, false); } JEMALLOC_ALWAYS_INLINE void * cache_bin_alloc_easy(cache_bin_t *bin, cache_bin_info_t *info, bool *success) { - return cache_bin_alloc_easy_impl(bin, info, success, true); + /* We don't use info now, but we may want to in the future. */ + (void)info; + return cache_bin_alloc_easy_impl(bin, success, true); } JEMALLOC_ALWAYS_INLINE bool cache_bin_dalloc_easy(cache_bin_t *bin, void *ptr) { - if (unlikely(bin->cur_ptr.lowbits == bin->full_position)) { + uint16_t low_bits = (uint16_t)(uintptr_t)bin->stack_head; + if (unlikely(low_bits == bin->low_bits_full)) { return false; } - *(--bin->cur_ptr.ptr) = ptr; - assert(bin->cur_ptr.lowbits >= bin->full_position); + bin->stack_head--; + *bin->stack_head = ptr; + cache_bin_assert_earlier(bin, bin->low_bits_full, + (uint16_t)(uintptr_t)bin->stack_head); return true; } @@ -279,8 +284,8 @@ struct cache_bin_ptr_array_s { static inline void cache_bin_init_ptr_array_for_fill(cache_bin_t *bin, cache_bin_info_t *info, cache_bin_ptr_array_t *arr, cache_bin_sz_t nfill) { - arr->ptr = cache_bin_empty_position_get(bin, info) - nfill; assert(cache_bin_ncached_get(bin, info) == 0); + arr->ptr = cache_bin_empty_position_get(bin, info) - nfill; } /* @@ -292,12 +297,12 @@ static inline void cache_bin_finish_fill(cache_bin_t *bin, cache_bin_info_t *info, cache_bin_ptr_array_t *arr, cache_bin_sz_t nfilled) { assert(cache_bin_ncached_get(bin, info) == 0); + void **empty_position = cache_bin_empty_position_get(bin, info); if (nfilled < arr->n) { - void **empty_position = cache_bin_empty_position_get(bin, info); memmove(empty_position - nfilled, empty_position - arr->n, nfilled * sizeof(void *)); } - cache_bin_ncached_set(bin, info, nfilled); + bin->stack_head = empty_position - nfilled; } static inline void @@ -326,11 +331,12 @@ static inline void cache_bin_finish_flush(cache_bin_t *bin, cache_bin_info_t *info, cache_bin_ptr_array_t *arr, cache_bin_sz_t nflushed) { unsigned rem = cache_bin_ncached_get(bin, info) - nflushed; - memmove(bin->cur_ptr.ptr + nflushed, bin->cur_ptr.ptr, + memmove(bin->stack_head + nflushed, bin->stack_head, rem * sizeof(void *)); - cache_bin_ncached_set(bin, info, rem); - if (bin->cur_ptr.lowbits > bin->low_water_position) { - bin->low_water_position = bin->cur_ptr.lowbits; + bin->stack_head = bin->stack_head + nflushed; + if (cache_bin_ncached_get(bin, info) + < cache_bin_low_water_get_internal(bin, info)) { + bin->low_bits_low_water = (uint16_t)(uintptr_t)bin->stack_head; } } diff --git a/src/cache_bin.c b/src/cache_bin.c index 94f3b32e..51b87499 100644 --- a/src/cache_bin.c +++ b/src/cache_bin.c @@ -8,7 +8,7 @@ cache_bin_info_init(cache_bin_info_t *info, cache_bin_sz_t ncached_max) { size_t stack_size = (size_t)ncached_max * sizeof(void *); assert(stack_size < ((size_t)1 << (sizeof(cache_bin_sz_t) * 8))); - info->stack_size = (cache_bin_sz_t)stack_size; + info->ncached_max = (cache_bin_sz_t)ncached_max; } void @@ -23,23 +23,14 @@ cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos, */ *size = sizeof(void *) * 2; for (szind_t i = 0; i < ninfos; i++) { - *size += infos[i].stack_size; + *size += infos[i].ncached_max * sizeof(void *); } /* - * 1) Align to at least PAGE, to minimize the # of TLBs needed by the + * Align to at least PAGE, to minimize the # of TLBs needed by the * smaller sizes; also helps if the larger sizes don't get used at all. - * 2) On 32-bit the pointers won't be compressed; use minimal alignment. */ - if (LG_SIZEOF_PTR < 3 || *size < PAGE) { - *alignment = PAGE; - } else { - /* - * Align pow2 to avoid overflow the cache bin compressed - * pointers. - */ - *alignment = pow2_ceil_zu(*size); - } + *alignment = PAGE; } void @@ -53,10 +44,6 @@ cache_bin_preincrement(cache_bin_info_t *infos, szind_t ninfos, void *alloc, cache_bin_info_compute_alloc(infos, ninfos, &computed_size, &computed_alignment); assert(((uintptr_t)alloc & (computed_alignment - 1)) == 0); - - /* And that alignment should disallow overflow. */ - uint32_t lowbits = (uint32_t)((uintptr_t)alloc + computed_size); - assert((uint32_t)(uintptr_t)alloc < lowbits); } /* * Leave a noticeable mark pattern on the boundaries, in case a bug @@ -81,7 +68,6 @@ cache_bin_postincrement(cache_bin_info_t *infos, szind_t ninfos, void *alloc, void cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc, size_t *cur_offset) { - assert(sizeof(bin->cur_ptr) == sizeof(void *)); /* * The full_position points to the lowest available space. Allocations * will access the slots toward higher addresses (for the benefit of @@ -89,21 +75,23 @@ cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc, */ void *stack_cur = (void *)((uintptr_t)alloc + *cur_offset); void *full_position = stack_cur; - uint32_t bin_stack_size = info->stack_size; + uint16_t bin_stack_size = info->ncached_max * sizeof(void *); *cur_offset += bin_stack_size; void *empty_position = (void *)((uintptr_t)alloc + *cur_offset); /* Init to the empty position. */ - bin->cur_ptr.ptr = empty_position; - bin->low_water_position = bin->cur_ptr.lowbits; - bin->full_position = (uint32_t)(uintptr_t)full_position; - assert(bin->cur_ptr.lowbits - bin->full_position == bin_stack_size); + bin->stack_head = (void **)empty_position; + bin->low_bits_low_water = (uint16_t)(uintptr_t)bin->stack_head; + bin->low_bits_full = (uint16_t)(uintptr_t)full_position; + bin->low_bits_empty = (uint16_t)(uintptr_t)empty_position; + assert(cache_bin_diff(bin, bin->low_bits_full, + (uint16_t)(uintptr_t) bin->stack_head) == bin_stack_size); assert(cache_bin_ncached_get(bin, info) == 0); assert(cache_bin_empty_position_get(bin, info) == empty_position); } bool cache_bin_still_zero_initialized(cache_bin_t *bin) { - return bin->cur_ptr.ptr == NULL; + return bin->stack_head == NULL; }