diff --git a/jemalloc/include/jemalloc/internal/arena.h b/jemalloc/include/jemalloc/internal/arena.h
index 46d8fbc1..6855549d 100644
--- a/jemalloc/include/jemalloc/internal/arena.h
+++ b/jemalloc/include/jemalloc/internal/arena.h
@@ -121,8 +121,10 @@ struct arena_chunk_map_s {
	 *
	 * p : run page offset
	 * s : run size
+	 * c : size class (used only if prof_promote is true)
	 * x : don't care
	 * - : 0
+	 * + : 1
	 * [DZLA] : bit set
	 * [dzla] : bit unset
	 *
@@ -142,17 +144,27 @@ struct arena_chunk_map_s {
	 *     pppppppp pppppppp pppp---- ----d--a
	 *
	 *   Large:
-	 *     ssssssss ssssssss ssss---- ----D-la
+	 *     ssssssss ssssssss ssss++++ ++++D-la
	 *     xxxxxxxx xxxxxxxx xxxx---- ----xxxx
	 *     -------- -------- -------- ----D-la
+	 *
+	 *   Large (sampled, size <= PAGE_SIZE):
+	 *     ssssssss ssssssss sssscccc ccccD-la
+	 *
+	 *   Large (not sampled, size == PAGE_SIZE):
+	 *     ssssssss ssssssss ssss++++ ++++D-la
	 */
	size_t	bits;
-#define	CHUNK_MAP_FLAGS_MASK	((size_t)0x1fU)
-#define	CHUNK_MAP_KEY		((size_t)0x10U)
-#define	CHUNK_MAP_DIRTY		((size_t)0x08U)
-#define	CHUNK_MAP_ZEROED	((size_t)0x04U)
-#define	CHUNK_MAP_LARGE		((size_t)0x02U)
-#define	CHUNK_MAP_ALLOCATED	((size_t)0x01U)
+#ifdef JEMALLOC_PROF
+#define	CHUNK_MAP_CLASS_SHIFT	4
+#define	CHUNK_MAP_CLASS_MASK	((size_t)0xff0U)
+#endif
+#define	CHUNK_MAP_FLAGS_MASK	((size_t)0xfU)
+#define	CHUNK_MAP_DIRTY		((size_t)0x8U)
+#define	CHUNK_MAP_ZEROED	((size_t)0x4U)
+#define	CHUNK_MAP_LARGE		((size_t)0x2U)
+#define	CHUNK_MAP_ALLOCATED	((size_t)0x1U)
+#define	CHUNK_MAP_KEY		CHUNK_MAP_ALLOCATED
 };
 typedef rb_tree(arena_chunk_map_t) arena_avail_tree_t;
 typedef rb_tree(arena_chunk_map_t) arena_run_tree_t;
@@ -421,6 +433,8 @@ void	*arena_palloc(arena_t *arena, size_t alignment, size_t size,
     size_t alloc_size);
 size_t	arena_salloc(const void *ptr);
 #ifdef JEMALLOC_PROF
+void	arena_prof_promoted(const void *ptr, size_t size);
+size_t	arena_salloc_demote(const void *ptr);
 prof_thr_cnt_t	*arena_prof_cnt_get(const void *ptr);
 void	arena_prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt);
 #endif
diff --git a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in
index 4490117b..2c3f32f1 100644
--- a/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/jemalloc/include/jemalloc/internal/jemalloc_internal.h.in
@@ -515,7 +515,11 @@ isalloc(const void *ptr)
		/* Region. */
		assert(chunk->arena->magic == ARENA_MAGIC);

+#ifdef JEMALLOC_PROF
+		ret = arena_salloc_demote(ptr);
+#else
		ret = arena_salloc(ptr);
+#endif
	} else
		ret = huge_salloc(ptr);
diff --git a/jemalloc/include/jemalloc/internal/prof.h b/jemalloc/include/jemalloc/internal/prof.h
index e29a5742..0a5db297 100644
--- a/jemalloc/include/jemalloc/internal/prof.h
+++ b/jemalloc/include/jemalloc/internal/prof.h
@@ -134,6 +134,12 @@ extern bool	opt_prof_leak;	/* Dump leak summary at exit. */
  */
 extern uint64_t	prof_interval;

+/*
+ * If true, promote small sampled objects to large objects, since small run
+ * headers do not have embedded profile context pointers.
+ */
+extern bool	prof_promote;
+
 bool	prof_init(prof_t *prof, bool master);
 void	prof_destroy(prof_t *prof);
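
Note on the arena.h hunk above: the eight bits freed by shrinking CHUNK_MAP_FLAGS_MASK record the original small size class (binind) of a promoted sampled object, with the all-ones pattern reserved to mean "not small". A minimal standalone sketch of that encoding; the MAP_CLASS_* names and helper functions below are illustrative stand-ins, not jemalloc's API:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define	MAP_CLASS_SHIFT	4		/* Mirrors CHUNK_MAP_CLASS_SHIFT. */
#define	MAP_CLASS_MASK	((size_t)0xff0U)	/* Mirrors CHUNK_MAP_CLASS_MASK. */

/* Record a small size class (binind) in a large mapping's bits. */
static size_t
map_class_set(size_t bits, size_t binind)
{

	assert(binind < 255);	/* All ones (0xff) is reserved for "not small". */
	return ((bits & ~MAP_CLASS_MASK) | (binind << MAP_CLASS_SHIFT));
}

/* Recover the class; 0xff means the mapping is a genuine large object. */
static size_t
map_class_get(size_t bits)
{

	return ((bits & MAP_CLASS_MASK) >> MAP_CLASS_SHIFT);
}

int
main(void)
{
	size_t bits = MAP_CLASS_MASK;	/* "Not small", as arena_run_split() writes it. */

	bits = map_class_set(bits, 3);	/* Promoted object from bin 3. */
	printf("binind = %zu\n", map_class_get(bits));	/* Prints "binind = 3". */
	return (0);
}

Reserving all ones for genuinely large mappings is what lets arena_salloc_demote() further down tell a promoted small object apart from a real one-page large object using the map word alone.
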
diff --git a/jemalloc/include/jemalloc/internal/tcache.h b/jemalloc/include/jemalloc/internal/tcache.h
index e314fead..c76597fa 100644
--- a/jemalloc/include/jemalloc/internal/tcache.h
+++ b/jemalloc/include/jemalloc/internal/tcache.h
@@ -256,6 +256,12 @@ tcache_alloc_large(tcache_t *tcache, size_t size, bool zero)
		if (ret == NULL)
			return (NULL);
	} else {
+#ifdef JEMALLOC_PROF
+		arena_chunk_t *chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ret);
+		size_t pageind = (unsigned)(((uintptr_t)ret - (uintptr_t)chunk)
+		    >> PAGE_SHIFT);
+		chunk->map[pageind].bits |= CHUNK_MAP_CLASS_MASK;
+#endif
		if (zero == false) {
 #ifdef JEMALLOC_FILL
			if (opt_junk)
@@ -289,6 +295,8 @@ tcache_dalloc_small(tcache_t *tcache, void *ptr)
	size_t pageind, binind;
	arena_chunk_map_t *mapelm;

+	assert(arena_salloc(ptr) <= small_maxclass);
+
	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	arena = chunk->arena;
	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT);
@@ -334,6 +342,8 @@ tcache_dalloc_large(tcache_t *tcache, void *ptr, size_t size)
	arena_chunk_map_t *mapelm;

	assert((size & PAGE_MASK) == 0);
+	assert(arena_salloc(ptr) > small_maxclass);
+	assert(arena_salloc(ptr) <= tcache_maxclass);

	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
	arena = chunk->arena;
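
Both tcache hunks above derive a chunk map index from an object pointer before touching its bits. A compilable sketch of that arithmetic, with assumed chunk and page sizes standing in for jemalloc's CHUNK_ADDR2BASE() and PAGE_SHIFT:

#include <stdint.h>
#include <stdio.h>

#define	LG_CHUNK	22	/* 4 MiB chunks (illustrative value). */
#define	CHUNK_MASK	(((uintptr_t)1 << LG_CHUNK) - 1)
#define	PAGE_SHIFT	12	/* 4 KiB pages (illustrative value). */

int
main(void)
{
	uintptr_t ptr = 0x40000000 + 5 * 4096 + 128;	/* Some object address. */
	uintptr_t chunk = ptr & ~CHUNK_MASK;	/* CHUNK_ADDR2BASE(ptr). */
	size_t pageind = (ptr - chunk) >> PAGE_SHIFT;

	printf("page index %zu\n", pageind);	/* Prints "page index 5". */
	return (0);
}

The reason tcache_alloc_large() ORs CHUNK_MAP_CLASS_MASK back in appears to be that a run handed out of the tcache bypasses arena_run_split(), so any stale class bits left by a previously promoted object must be reset to "not small" by hand.
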
diff --git a/jemalloc/src/arena.c b/jemalloc/src/arena.c
index 8b2ab0c8..b6140b5a 100644
--- a/jemalloc/src/arena.c
+++ b/jemalloc/src/arena.c
@@ -218,8 +218,8 @@ arena_avail_comp(arena_chunk_map_t *a, arena_chunk_map_t *b)
	size_t a_size = a->bits & ~PAGE_MASK;
	size_t b_size = b->bits & ~PAGE_MASK;

-	assert(a->bits & CHUNK_MAP_KEY || (a->bits & CHUNK_MAP_DIRTY) ==
-	    (b->bits & CHUNK_MAP_DIRTY));
+	assert((a->bits & CHUNK_MAP_KEY) == CHUNK_MAP_KEY || (a->bits &
+	    CHUNK_MAP_DIRTY) == (b->bits & CHUNK_MAP_DIRTY));

	ret = (a_size > b_size) - (a_size < b_size);
	if (ret == 0) {
@@ -382,6 +382,9 @@ arena_run_split(arena_t *arena, arena_run_t *run, size_t size, bool large,
		chunk->map[run_ind+need_pages-1].bits = CHUNK_MAP_LARGE |
		    CHUNK_MAP_ALLOCATED | flag_dirty;
		chunk->map[run_ind].bits = size | CHUNK_MAP_LARGE |
+#ifdef JEMALLOC_PROF
+		    CHUNK_MAP_CLASS_MASK |
+#endif
		    CHUNK_MAP_ALLOCATED | flag_dirty;
	} else {
		assert(zero == false);
@@ -1210,7 +1213,7 @@ arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
		try_nregs--;
		try_hdr_size = sizeof(arena_run_t);
 #ifdef JEMALLOC_PROF
-		if (opt_prof) {
+		if (opt_prof && prof_promote == false) {
			/* Pad to a quantum boundary. */
			try_hdr_size = QUANTUM_CEILING(try_hdr_size);
			try_cnt0_offset = try_hdr_size;
@@ -1243,7 +1246,7 @@ arena_bin_run_size_calc(arena_bin_t *bin, size_t min_run_size)
			try_nregs--;
			try_hdr_size = sizeof(arena_run_t);
 #ifdef JEMALLOC_PROF
-			if (opt_prof) {
+			if (opt_prof && prof_promote == false) {
				/* Pad to a quantum boundary. */
				try_hdr_size = QUANTUM_CEILING(try_hdr_size);
				try_cnt0_offset = try_hdr_size;
@@ -1507,6 +1510,63 @@ arena_salloc(const void *ptr)
 }

 #ifdef JEMALLOC_PROF
+void
+arena_prof_promoted(const void *ptr, size_t size)
+{
+	arena_chunk_t *chunk;
+	size_t pageind, binind;
+
+	assert(ptr != NULL);
+	assert(CHUNK_ADDR2BASE(ptr) != ptr);
+	assert(isalloc(ptr) == PAGE_SIZE);
+
+	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT);
+	binind = small_size2bin[size];
+	assert(binind < nbins);
+	chunk->map[pageind].bits = (chunk->map[pageind].bits &
+	    ~CHUNK_MAP_CLASS_MASK) | (binind << CHUNK_MAP_CLASS_SHIFT);
+}
+
+size_t
+arena_salloc_demote(const void *ptr)
+{
+	size_t ret;
+	arena_chunk_t *chunk;
+	size_t pageind, mapbits;
+
+	assert(ptr != NULL);
+	assert(CHUNK_ADDR2BASE(ptr) != ptr);
+
+	chunk = (arena_chunk_t *)CHUNK_ADDR2BASE(ptr);
+	pageind = (((uintptr_t)ptr - (uintptr_t)chunk) >> PAGE_SHIFT);
+	mapbits = chunk->map[pageind].bits;
+	assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
+	if ((mapbits & CHUNK_MAP_LARGE) == 0) {
+		arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
+		    (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
+		    PAGE_SHIFT));
+		assert(run->magic == ARENA_RUN_MAGIC);
+		assert(((uintptr_t)ptr - ((uintptr_t)run +
+		    (uintptr_t)run->bin->reg0_offset)) % run->bin->reg_size ==
+		    0);
+		ret = run->bin->reg_size;
+	} else {
+		assert(((uintptr_t)ptr & PAGE_MASK) == 0);
+		ret = mapbits & ~PAGE_MASK;
+		if (prof_promote && ret == PAGE_SIZE && (mapbits &
+		    CHUNK_MAP_CLASS_MASK) != CHUNK_MAP_CLASS_MASK) {
+			size_t binind = ((mapbits & CHUNK_MAP_CLASS_MASK) >>
+			    CHUNK_MAP_CLASS_SHIFT);
+			assert(binind < nbins);
+			ret = chunk->arena->bins[binind].reg_size;
+		}
+		assert(ret != 0);
+	}
+
+	return (ret);
+}
+
 static inline unsigned
 arena_run_regind(arena_run_t *run, arena_bin_t *bin, const void *ptr,
     size_t size)
@@ -1585,19 +1645,23 @@ arena_prof_cnt_get(const void *ptr)
	mapbits = chunk->map[pageind].bits;
	assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
	if ((mapbits & CHUNK_MAP_LARGE) == 0) {
-		arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
-		    (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
-		    PAGE_SHIFT));
-		arena_bin_t *bin = run->bin;
-		unsigned regind;
+		if (prof_promote)
+			ret = (prof_thr_cnt_t *)(uintptr_t)1U;
+		else {
+			arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
+			    (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
+			    PAGE_SHIFT));
+			arena_bin_t *bin = run->bin;
+			unsigned regind;

-		assert(run->magic == ARENA_RUN_MAGIC);
-		regind = arena_run_regind(run, bin, ptr, bin->reg_size);
-		ret = *(prof_thr_cnt_t **)((uintptr_t)run + bin->cnt0_offset +
-		    (regind * sizeof(prof_thr_cnt_t *)));
-	} else {
+			assert(run->magic == ARENA_RUN_MAGIC);
+			regind = arena_run_regind(run, bin, ptr, bin->reg_size);
+			ret = *(prof_thr_cnt_t **)((uintptr_t)run +
+			    bin->cnt0_offset + (regind *
+			    sizeof(prof_thr_cnt_t *)));
+		}
+	} else
		ret = chunk->map[pageind].prof_cnt;
-	}

	return (ret);
 }
@@ -1616,20 +1680,22 @@ arena_prof_cnt_set(const void *ptr, prof_thr_cnt_t *cnt)
	mapbits = chunk->map[pageind].bits;
	assert((mapbits & CHUNK_MAP_ALLOCATED) != 0);
	if ((mapbits & CHUNK_MAP_LARGE) == 0) {
-		arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
-		    (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
-		    PAGE_SHIFT));
-		arena_bin_t *bin = run->bin;
-		unsigned regind;
+		if (prof_promote == false) {
+			arena_run_t *run = (arena_run_t *)((uintptr_t)chunk +
+			    (uintptr_t)((pageind - (mapbits >> PAGE_SHIFT)) <<
+			    PAGE_SHIFT));
+			arena_bin_t *bin = run->bin;
+			unsigned regind;

-		assert(run->magic == ARENA_RUN_MAGIC);
-		regind = arena_run_regind(run, bin, ptr, bin->reg_size);
+			assert(run->magic == ARENA_RUN_MAGIC);
+			regind = arena_run_regind(run, bin, ptr, bin->reg_size);

-		*((prof_thr_cnt_t **)((uintptr_t)run + bin->cnt0_offset +
-		    (regind * sizeof(prof_thr_cnt_t *)))) = cnt;
-	} else {
+			*((prof_thr_cnt_t **)((uintptr_t)run + bin->cnt0_offset
+			    + (regind * sizeof(prof_thr_cnt_t *)))) = cnt;
+		} else
+			assert((uintptr_t)cnt == (uintptr_t)1U);
+	} else
		chunk->map[pageind].prof_cnt = cnt;
-	}
 }
 #endif
@@ -2330,7 +2396,22 @@ arena_boot(void)
	 * 4KiB pages), and such configurations are impractical, but
	 * nonetheless we need to protect against this case in order to avoid
	 * undefined behavior.
+	 *
+	 * Further constrain nbins to 255 if prof_promote is true, since all
+	 * small size classes, plus a "not small" size class, must be stored
+	 * in 8 bits of arena_chunk_map_t's bits field.
	 */
+#ifdef JEMALLOC_PROF
+	if (opt_prof && prof_promote) {
+		if (nbins > 255) {
+			char line_buf[UMAX2S_BUFSIZE];
+			malloc_write("<jemalloc>: Too many small size classes (");
+			malloc_write(umax2s(nbins, 10, line_buf));
+			malloc_write(" > max 255)\n");
+			abort();
+		}
+	} else
+#endif
	if (nbins > 256) {
		char line_buf[UMAX2S_BUFSIZE];
		malloc_write("<jemalloc>: Too many small size classes (");
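
The heart of the arena.c change is arena_salloc_demote(): a sampled object physically occupies one page, but its reported size must remain the originally requested small class. A self-contained sketch of the large-object branch, assuming an illustrative bin table in place of chunk->arena->bins and simplified constants:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define	PAGE_SIZE	((size_t)4096)
#define	MAP_CLASS_SHIFT	4
#define	MAP_CLASS_MASK	((size_t)0xff0U)

/* Illustrative region sizes for the first few bins; not jemalloc's tables. */
static const size_t bin_reg_size[] = {8, 16, 32, 48, 64, 96, 128, 192};

/* The large-object branch of arena_salloc_demote(), in isolation. */
static size_t
large_salloc_demote(size_t mapbits, int prof_promote)
{
	size_t ret = mapbits & ~(PAGE_SIZE - 1);	/* Large run size. */

	if (prof_promote && ret == PAGE_SIZE &&
	    (mapbits & MAP_CLASS_MASK) != MAP_CLASS_MASK) {
		/* Promoted small object: report the original class's size. */
		size_t binind = (mapbits & MAP_CLASS_MASK) >> MAP_CLASS_SHIFT;

		assert(binind < sizeof(bin_reg_size) / sizeof(bin_reg_size[0]));
		ret = bin_reg_size[binind];
	}
	return (ret);
}

int
main(void)
{
	/* One page, class bits = 2: a promoted 32-byte object. */
	size_t mapbits = PAGE_SIZE | ((size_t)2 << MAP_CLASS_SHIFT) | 0x3;

	printf("%zu\n", large_salloc_demote(mapbits, 1));	/* Prints "32". */
	printf("%zu\n", large_salloc_demote(mapbits, 0));	/* Prints "4096". */
	return (0);
}
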
diff --git a/jemalloc/src/jemalloc.c b/jemalloc/src/jemalloc.c
index d880769d..b30f2313 100644
--- a/jemalloc/src/jemalloc.c
+++ b/jemalloc/src/jemalloc.c
@@ -835,13 +835,21 @@ JEMALLOC_P(malloc)(size_t size)
	}

 #ifdef JEMALLOC_PROF
-	if (opt_prof && (cnt = prof_alloc_prep(size)) == NULL) {
-		ret = NULL;
-		goto OOM;
-	}
+	if (opt_prof) {
+		if ((cnt = prof_alloc_prep(size)) == NULL) {
+			ret = NULL;
+			goto OOM;
+		}
+		if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && size <=
+		    small_maxclass) {
+			ret = imalloc(small_maxclass+1);
+			if (ret != NULL)
+				arena_prof_promoted(ret, size);
+		} else
+			ret = imalloc(size);
+	} else
 #endif
-
-	ret = imalloc(size);
+		ret = imalloc(size);

 OOM:
	if (ret == NULL) {
@@ -918,12 +926,24 @@ JEMALLOC_P(posix_memalign)(void **memptr, size_t alignment, size_t size)
		}

 #ifdef JEMALLOC_PROF
-		if (opt_prof && (cnt = prof_alloc_prep(size)) == NULL) {
-			result = NULL;
-			ret = EINVAL;
+		if (opt_prof) {
+			if ((cnt = prof_alloc_prep(size)) == NULL) {
+				result = NULL;
+				ret = EINVAL;
+			} else {
+				if (prof_promote && (uintptr_t)cnt !=
+				    (uintptr_t)1U && size <= small_maxclass) {
+					result = ipalloc(alignment,
+					    small_maxclass+1);
+					if (result != NULL) {
+						arena_prof_promoted(result,
+						    size);
+					}
+				} else
+					result = ipalloc(alignment, size);
+			}
		} else
 #endif
			result = ipalloc(alignment, size);
	}
@@ -992,13 +1012,21 @@ JEMALLOC_P(calloc)(size_t num, size_t size)
	}

 #ifdef JEMALLOC_PROF
-	if (opt_prof && (cnt = prof_alloc_prep(num_size)) == NULL) {
-		ret = NULL;
-		goto RETURN;
-	}
+	if (opt_prof) {
+		if ((cnt = prof_alloc_prep(num_size)) == NULL) {
+			ret = NULL;
+			goto RETURN;
+		}
+		if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U && num_size
+		    <= small_maxclass) {
+			ret = icalloc(small_maxclass+1);
+			if (ret != NULL)
+				arena_prof_promoted(ret, num_size);
+		} else
+			ret = icalloc(num_size);
+	} else
 #endif
-
-	ret = icalloc(num_size);
+		ret = icalloc(num_size);

 RETURN:
	if (ret == NULL) {
@@ -1071,10 +1099,16 @@ JEMALLOC_P(realloc)(void *ptr, size_t size)
				ret = NULL;
				goto OOM;
			}
-		}
+			if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U &&
+			    size <= small_maxclass) {
+				ret = iralloc(ptr, small_maxclass+1);
+				if (ret != NULL)
+					arena_prof_promoted(ret, size);
+			} else
+				ret = iralloc(ptr, size);
+		} else
 #endif
-
-		ret = iralloc(ptr, size);
+			ret = iralloc(ptr, size);

 #ifdef JEMALLOC_PROF
 OOM:
@@ -1104,8 +1138,21 @@ OOM:
			ret = NULL;
		else {
 #ifdef JEMALLOC_PROF
-			if (opt_prof && (cnt = prof_alloc_prep(size)) == NULL) {
-				ret = NULL;
+			if (opt_prof) {
+				if ((cnt = prof_alloc_prep(size)) == NULL)
+					ret = NULL;
+				else {
+					if (prof_promote && (uintptr_t)cnt !=
+					    (uintptr_t)1U && size <=
+					    small_maxclass) {
+						ret = imalloc(small_maxclass+1);
+						if (ret != NULL) {
+							arena_prof_promoted(ret,
+							    size);
+						}
+					} else
+						ret = imalloc(size);
+				}
			} else
 #endif
				ret = imalloc(size);
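
Each entry point above repeats the same pattern: prof_alloc_prep() returns (void *)1 for "not sampled", and a sampled request no larger than small_maxclass is bumped to small_maxclass+1 bytes so it lands in a page-sized large run, after which arena_prof_promoted() records the real class. The shape of that path as a standalone sketch; SMALL_MAXCLASS's value, record_promotion(), and plain malloc() in place of imalloc() are all stand-ins:

#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>

#define	SMALL_MAXCLASS	((size_t)3584)	/* Illustrative; jemalloc derives it. */

/* Stand-in for arena_prof_promoted(); would record size's bin in the map. */
static void
record_promotion(void *ptr, size_t size)
{

	(void)ptr;
	(void)size;
}

/*
 * The decision repeated in each allocator entry point: cnt == (void *)1
 * means "not sampled"; sampled small requests are rounded up past
 * SMALL_MAXCLASS so they are backed by a large (page-sized) run.
 */
static void *
prof_alloc(size_t size, void *cnt, bool prof_promote)
{
	void *ret;

	if (prof_promote && (uintptr_t)cnt != (uintptr_t)1U &&
	    size <= SMALL_MAXCLASS) {
		ret = malloc(SMALL_MAXCLASS + 1);	/* imalloc() in the patch. */
		if (ret != NULL)
			record_promotion(ret, size);
	} else
		ret = malloc(size);
	return (ret);
}

int
main(void)
{
	void *p = prof_alloc(100, (void *)(uintptr_t)1U, true);	/* Not sampled. */
	void *q = prof_alloc(100, &p, true);	/* Sampled: promoted. */

	free(p);
	free(q);
	return (0);
}
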
diff --git a/jemalloc/src/prof.c b/jemalloc/src/prof.c
index 1a667a98..80c81dac 100644
--- a/jemalloc/src/prof.c
+++ b/jemalloc/src/prof.c
@@ -25,6 +25,7 @@ bool	opt_prof_udump = false;
 bool	opt_prof_leak = false;

 uint64_t	prof_interval;
+bool	prof_promote;

 /*
  * Global hash of (prof_bt_t *)-->(prof_ctx_t *).  This is the master data
@@ -1250,8 +1251,8 @@ prof_boot0(void)
 {

	/*
-	 * opt_prof must be in its final state before any arenas are
-	 * initialized, so this function must be executed early.
+	 * opt_prof and prof_promote must be in their final state before any
+	 * arenas are initialized, so this function must be executed early.
	 */

	if (opt_lg_prof_sample > 0) {
@@ -1272,6 +1273,8 @@ prof_boot0(void)
		prof_interval = 0;
	} else if (opt_prof)
		prof_interval = (((uint64_t)1U) << opt_lg_prof_interval);
+
+	prof_promote = (opt_prof && opt_lg_prof_sample > PAGE_SHIFT);
 }

 bool
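
prof_boot0() turns promotion on only when the mean sample interval, 2^opt_lg_prof_sample bytes, exceeds one page. The apparent trade-off: promotion rounds every sampled small object up to a full page, which is cheap when samples are rare but would be wasteful if most small objects were sampled, in which case the per-region context pointers in run headers are the better deal. The decision in isolation, with illustrative values for PAGE_SHIFT and the sample interval:

#include <stdbool.h>
#include <stdio.h>

#define	PAGE_SHIFT	12	/* 4 KiB pages (illustrative). */

int
main(void)
{
	bool opt_prof = true;
	unsigned opt_lg_prof_sample = 19;	/* Mean interval 2^19 = 512 KiB. */

	/*
	 * Promote only when the average sampling interval exceeds one page,
	 * so the page-size rounding of sampled objects stays negligible
	 * relative to the run-header space it avoids reserving.
	 */
	bool prof_promote = (opt_prof && opt_lg_prof_sample > PAGE_SHIFT);

	printf("prof_promote = %d\n", (int)prof_promote);	/* Prints "prof_promote = 1". */
	return (0);
}
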