diff --git a/include/jemalloc/internal/prof_externs.h b/include/jemalloc/internal/prof_externs.h
index 4579ab02..ba5933af 100644
--- a/include/jemalloc/internal/prof_externs.h
+++ b/include/jemalloc/internal/prof_externs.h
@@ -19,6 +19,7 @@ extern char opt_prof_prefix[
     PATH_MAX +
 #endif
     1];
+extern bool opt_prof_unbias;
 
 /* For recording recent allocations */
 extern ssize_t opt_prof_recent_alloc_max;
@@ -40,6 +41,9 @@ extern uint64_t prof_interval;
  * resets.
  */
 extern size_t lg_prof_sample;
+extern size_t prof_unbiased_sz[SC_NSIZES];
+extern size_t prof_shifted_unbiased_cnt[SC_NSIZES];
+void prof_unbias_map_init(void);
 
 extern bool prof_booted;
 
diff --git a/include/jemalloc/internal/prof_structs.h b/include/jemalloc/internal/prof_structs.h
index 26942aa6..fbad6145 100644
--- a/include/jemalloc/internal/prof_structs.h
+++ b/include/jemalloc/internal/prof_structs.h
@@ -24,9 +24,13 @@ typedef struct {
 struct prof_cnt_s {
 	/* Profiling counters. */
 	uint64_t curobjs;
+	uint64_t curobjs_shifted_unbiased;
 	uint64_t curbytes;
+	uint64_t curbytes_unbiased;
 	uint64_t accumobjs;
+	uint64_t accumobjs_shifted_unbiased;
 	uint64_t accumbytes;
+	uint64_t accumbytes_unbiased;
 };
 
 typedef enum {
diff --git a/src/jemalloc.c b/src/jemalloc.c
index f2e5f8eb..ae9ef3d1 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -1517,6 +1517,16 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
 				}
 				CONF_CONTINUE;
 			}
+			/*
+			 * Undocumented.  When set to false, don't
+			 * correct for an unbiasing bug in jeprof
+			 * attribution.  This can be handy if you want
+			 * to get consistent numbers from your binary
+			 * across different jemalloc versions, even if
+			 * those numbers are incorrect.  The default is
+			 * true.
+			 */
+			CONF_HANDLE_BOOL(opt_prof_unbias, "prof_unbias")
 		}
 		if (config_log) {
 			if (CONF_MATCH("log")) {
diff --git a/src/prof.c b/src/prof.c
index 25735410..7b649e49 100644
--- a/src/prof.c
+++ b/src/prof.c
@@ -32,6 +32,7 @@ bool opt_prof_leak = false;
 bool opt_prof_accum = false;
 char opt_prof_prefix[PROF_DUMP_FILENAME_LEN];
 bool opt_prof_sys_thread_name = false;
+bool opt_prof_unbias = true;
 
 /* Accessed via prof_sample_event_handler(). */
 static counter_accum_t prof_idump_accumulated;
@@ -60,6 +61,8 @@ static malloc_mutex_t prof_gdump_mtx;
 uint64_t prof_interval = 0;
 
 size_t lg_prof_sample;
+size_t prof_unbiased_sz[SC_NSIZES];
+size_t prof_shifted_unbiased_cnt[SC_NSIZES];
 
 static uint64_t next_thr_uid;
 static malloc_mutex_t next_thr_uid_mtx;
@@ -69,6 +72,40 @@ bool prof_booted = false;
 
 /******************************************************************************/
 
+void prof_unbias_map_init(void) {
+	/* See the comment in prof_sample_new_event_wait(). */
+#ifdef JEMALLOC_PROF
+	for (szind_t i = 0; i < SC_NSIZES; i++) {
+		double sz = (double)sz_index2size(i);
+		double rate = (double)(ZU(1) << lg_prof_sample);
+		double div_val = 1.0 - exp(-sz / rate);
+		double unbiased_sz = sz / div_val;
+		/*
+		 * The "true" unbiased value for the count is
+		 * 1.0/(1 - exp(-sz/rate)).  The problem is, we keep the counts
+		 * as integers (for a variety of reasons -- rounding errors
+		 * could trigger asserts, and not all libcs can properly handle
+		 * floating point arithmetic during malloc calls inside libc).
+		 * Rounding to an integer, though, can lead to rounding errors
+		 * of over 30% for sizes close to the sampling rate.  So
+		 * instead, we multiply by a constant, dividing the maximum
+		 * possible roundoff error by that constant.
+		 * To avoid overflow in summing up size_t values, the largest
+		 * safe constant we can pick is the size of the smallest
+		 * allocation.
+		 */
+		double cnt_shift = (double)(ZU(1) << SC_LG_TINY_MIN);
+		double shifted_unbiased_cnt = cnt_shift / div_val;
+		prof_unbiased_sz[i] = (size_t)round(unbiased_sz);
+		prof_shifted_unbiased_cnt[i] = (size_t)round(
+		    shifted_unbiased_cnt);
+	}
+#else
+	unreachable();
+#endif
+}
+
+/******************************************************************************/
+
 void
 prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx) {
 	cassert(config_prof);
@@ -96,12 +133,30 @@ prof_malloc_sample_object(tsd_t *tsd, const void *ptr, size_t size,
 	    ptr);
 	prof_info_set(tsd, edata, tctx);
 
+	szind_t szind = sz_size2index(size);
+
 	malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock);
+	/*
+	 * We need to do these map lookups while holding the lock, to avoid
+	 * the possibility of races with prof_reset calls, which update the
+	 * map and then acquire the lock.  This actually still leaves a data
+	 * race on the contents of the unbias map, but we have not yet gone
+	 * through and atomic-ified the prof module, and compilers are not
+	 * yet causing us issues.  The key thing is that, if we do read
+	 * garbage data, the prof_reset call is about to mark our tctx as
+	 * expired before any dumping of our corrupted output is attempted.
+	 */
+	size_t shifted_unbiased_cnt = prof_shifted_unbiased_cnt[szind];
+	size_t unbiased_bytes = prof_unbiased_sz[szind];
 	tctx->cnts.curobjs++;
+	tctx->cnts.curobjs_shifted_unbiased += shifted_unbiased_cnt;
 	tctx->cnts.curbytes += usize;
+	tctx->cnts.curbytes_unbiased += unbiased_bytes;
 	if (opt_prof_accum) {
 		tctx->cnts.accumobjs++;
+		tctx->cnts.accumobjs_shifted_unbiased += shifted_unbiased_cnt;
 		tctx->cnts.accumbytes += usize;
+		tctx->cnts.accumbytes_unbiased += unbiased_bytes;
 	}
 	bool record_recent = prof_recent_alloc_prepare(tsd, tctx);
 	tctx->prepared = false;
@@ -118,12 +173,21 @@ prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_info_t *prof_info) {
 	prof_tctx_t *tctx = prof_info->alloc_tctx;
 	assert((uintptr_t)tctx > (uintptr_t)1U);
 
+	szind_t szind = sz_size2index(usize);
 	malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock);
 
 	assert(tctx->cnts.curobjs > 0);
 	assert(tctx->cnts.curbytes >= usize);
+	/*
+	 * It's not correct to do equivalent asserts for the unbiased bytes,
+	 * because of the potential for races with prof.reset calls.  The map
+	 * contents should really be atomic, but we have not atomic-ified the
+	 * prof module yet.
+	 */
 	tctx->cnts.curobjs--;
+	tctx->cnts.curobjs_shifted_unbiased -= prof_shifted_unbiased_cnt[szind];
 	tctx->cnts.curbytes -= usize;
+	tctx->cnts.curbytes_unbiased -= prof_unbiased_sz[szind];
 
 	prof_try_log(tsd, usize, prof_info);
@@ -517,6 +581,7 @@ prof_boot2(tsd_t *tsd, base_t *base) {
 		unsigned i;
 
 		lg_prof_sample = opt_lg_prof_sample;
+		prof_unbias_map_init();
 
 		prof_active = opt_prof_active;
 		if (malloc_mutex_init(&prof_active_mtx, "prof_active",
diff --git a/src/prof_data.c b/src/prof_data.c
index 6b441de1..ae9cd4b1 100644
--- a/src/prof_data.c
+++ b/src/prof_data.c
@@ -514,12 +514,121 @@ prof_dump_printf(write_cb_t *prof_dump_write, void *cbopaque,
 	prof_dump_write(cbopaque, buf);
 }
 
+/*
+ * Casting a double to a uint64_t is UB if the value is out of range for the
+ * target type.  I don't think this is practically possible with the cur
+ * counters, but it plausibly could be with the accum counters.
+ */
+#ifdef JEMALLOC_PROF
+static uint64_t
+prof_double_uint64_cast(double d) {
+	/*
+	 * Note: UINT64_MAX + 1 is exactly representable as a double on all
+	 * reasonable platforms (certainly those we'll support).  Writing this
+	 * as !(a < b) instead of (a >= b) means that we're NaN-safe.
+	 */
+	double rounded = round(d);
+	if (!(rounded < (double)UINT64_MAX)) {
+		return UINT64_MAX;
+	}
+	return (uint64_t)rounded;
+}
+#endif
+
+/*
+ * The unbiasing story is long.  The jeprof unbiasing logic was copied from
+ * pprof.  Both shared an issue: they unbiased using the average size of the
+ * allocations at a particular stack trace.  This can work out OK if
+ * allocations are mostly of the same size given some stack, but not
+ * otherwise.  We now internally track what the unbiased results ought to be.
+ * We can't just report them as they are though; they'll still go through the
+ * jeprof unbiasing process.  Instead, we figure out what values we can feed
+ * *into* jeprof's unbiasing mechanism that will lead to it producing the
+ * right values as output.
+ *
+ * It'll unbias count and aggregate size as:
+ *
+ *   c_out = c_in * 1/(1-exp(-s_in/c_in/R))
+ *   s_out = s_in * 1/(1-exp(-s_in/c_in/R))
+ *
+ * We want to solve for the values of c_in and s_in that will give the c_out
+ * and s_out that we've computed internally.
+ *
+ * Let's do a change of variables (both to make the math easier and to make
+ * it easier to write):
+ *   x = s_in / c_in
+ *   y = s_in
+ *   k = 1/R.
+ *
+ * Then
+ *   c_out = y/x * 1/(1-exp(-k*x))
+ *   s_out = y * 1/(1-exp(-k*x))
+ *
+ * The first equation gives:
+ *   y = x * c_out * (1-exp(-k*x))
+ * The second gives:
+ *   y = s_out * (1-exp(-k*x))
+ * Equating the two, we get
+ *   x = s_out / c_out.
+ * And all the other values fall out from that.
+ *
+ * This is all a fair bit of work.  The thing we get out of it is that we
+ * don't break backwards compatibility with jeprof (and the various tools
+ * that have copied its unbiasing logic).  Eventually, we anticipate a v3
+ * heap profile dump format based on JSON, at which point I think much of
+ * this logic can get cleaned up (since we'll be taking a compatibility
+ * break there anyways).
+ */
+static void
+prof_do_unbias(uint64_t c_out_shifted_i, uint64_t s_out_i, uint64_t *r_c_in,
+    uint64_t *r_s_in) {
+#ifdef JEMALLOC_PROF
+	if (c_out_shifted_i == 0 || s_out_i == 0) {
+		*r_c_in = 0;
+		*r_s_in = 0;
+		return;
+	}
+	/*
+	 * See the note in prof_unbias_map_init() for why we take c_out in a
+	 * shifted form.
+	 */
+	double c_out = (double)c_out_shifted_i
+	    / (double)(ZU(1) << SC_LG_TINY_MIN);
+	double s_out = (double)s_out_i;
+	double R = (double)(ZU(1) << lg_prof_sample);
+
+	double x = s_out / c_out;
+	double y = s_out * (1.0 - exp(-x / R));
+
+	double c_in = y / x;
+	double s_in = y;
+
+	*r_c_in = prof_double_uint64_cast(c_in);
+	*r_s_in = prof_double_uint64_cast(s_in);
+#else
+	unreachable();
+#endif
+}
+
 static void
 prof_dump_print_cnts(write_cb_t *prof_dump_write, void *cbopaque,
     const prof_cnt_t *cnts) {
+	uint64_t curobjs;
+	uint64_t curbytes;
+	uint64_t accumobjs;
+	uint64_t accumbytes;
+	if (opt_prof_unbias) {
+		prof_do_unbias(cnts->curobjs_shifted_unbiased,
+		    cnts->curbytes_unbiased, &curobjs, &curbytes);
+		prof_do_unbias(cnts->accumobjs_shifted_unbiased,
+		    cnts->accumbytes_unbiased, &accumobjs, &accumbytes);
+	} else {
+		curobjs = cnts->curobjs;
+		curbytes = cnts->curbytes;
+		accumobjs = cnts->accumobjs;
+		accumbytes = cnts->accumbytes;
+	}
 	prof_dump_printf(prof_dump_write, cbopaque,
 	    "%"FMTu64": %"FMTu64" [%"FMTu64": %"FMTu64"]",
-	    cnts->curobjs, cnts->curbytes, cnts->accumobjs, cnts->accumbytes);
+	    curobjs, curbytes, accumobjs, accumbytes);
 }
 
 static void
@@ -539,12 +648,20 @@ prof_tctx_merge_tdata(tsdn_t *tsdn, prof_tctx_t *tctx, prof_tdata_t *tdata) {
 		memcpy(&tctx->dump_cnts, &tctx->cnts, sizeof(prof_cnt_t));
 		tdata->cnt_summed.curobjs += tctx->dump_cnts.curobjs;
+		tdata->cnt_summed.curobjs_shifted_unbiased
+		    += tctx->dump_cnts.curobjs_shifted_unbiased;
 		tdata->cnt_summed.curbytes += tctx->dump_cnts.curbytes;
+		tdata->cnt_summed.curbytes_unbiased
+		    += tctx->dump_cnts.curbytes_unbiased;
 		if (opt_prof_accum) {
 			tdata->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs;
+			tdata->cnt_summed.accumobjs_shifted_unbiased +=
+			    tctx->dump_cnts.accumobjs_shifted_unbiased;
 			tdata->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes;
+			tdata->cnt_summed.accumbytes_unbiased +=
+			    tctx->dump_cnts.accumbytes_unbiased;
 		}
 		break;
 	case prof_tctx_state_dumping:
@@ -558,10 +675,17 @@ prof_tctx_merge_gctx(tsdn_t *tsdn, prof_tctx_t *tctx, prof_gctx_t *gctx) {
 	malloc_mutex_assert_owner(tsdn, gctx->lock);
 
 	gctx->cnt_summed.curobjs += tctx->dump_cnts.curobjs;
+	gctx->cnt_summed.curobjs_shifted_unbiased
+	    += tctx->dump_cnts.curobjs_shifted_unbiased;
 	gctx->cnt_summed.curbytes += tctx->dump_cnts.curbytes;
+	gctx->cnt_summed.curbytes_unbiased += tctx->dump_cnts.curbytes_unbiased;
 	if (opt_prof_accum) {
 		gctx->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs;
+		gctx->cnt_summed.accumobjs_shifted_unbiased
+		    += tctx->dump_cnts.accumobjs_shifted_unbiased;
 		gctx->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes;
+		gctx->cnt_summed.accumbytes_unbiased
+		    += tctx->dump_cnts.accumbytes_unbiased;
 	}
 }
 
@@ -757,11 +881,19 @@ prof_tdata_merge_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata,
 		}
 
 		arg->cnt_all->curobjs += tdata->cnt_summed.curobjs;
+		arg->cnt_all->curobjs_shifted_unbiased
+		    += tdata->cnt_summed.curobjs_shifted_unbiased;
 		arg->cnt_all->curbytes += tdata->cnt_summed.curbytes;
+		arg->cnt_all->curbytes_unbiased
+		    += tdata->cnt_summed.curbytes_unbiased;
 		if (opt_prof_accum) {
 			arg->cnt_all->accumobjs += tdata->cnt_summed.accumobjs;
+			arg->cnt_all->accumobjs_shifted_unbiased
+			    += tdata->cnt_summed.accumobjs_shifted_unbiased;
 			arg->cnt_all->accumbytes +=
 			    tdata->cnt_summed.accumbytes;
+			arg->cnt_all->accumbytes_unbiased +=
+			    tdata->cnt_summed.accumbytes_unbiased;
 		}
 	} else {
 		tdata->dumping = false;
@@ -814,8 +946,16 @@ prof_dump_gctx(prof_dump_iter_arg_t *arg, prof_gctx_t *gctx,
 	    (opt_prof_accum && gctx->cnt_summed.accumobjs ==
 	    0)) {
 		assert(gctx->cnt_summed.curobjs == 0);
 		assert(gctx->cnt_summed.curbytes == 0);
+		/*
+		 * These asserts would not be correct -- see the comment on
+		 * races in prof.c:
+		 * assert(gctx->cnt_summed.curobjs_shifted_unbiased == 0);
+		 * assert(gctx->cnt_summed.curbytes_unbiased == 0);
+		 */
 		assert(gctx->cnt_summed.accumobjs == 0);
+		assert(gctx->cnt_summed.accumobjs_shifted_unbiased == 0);
 		assert(gctx->cnt_summed.accumbytes == 0);
+		assert(gctx->cnt_summed.accumbytes_unbiased == 0);
 		return;
 	}
 
@@ -834,7 +974,7 @@ prof_dump_gctx(prof_dump_iter_arg_t *arg, prof_gctx_t *gctx,
 	}
 
 /*
- * See prof_sample_threshold_update() comment for why the body of this function
+ * See prof_sample_new_event_wait() comment for why the body of this function
  * is conditionally compiled.
  */
 static void
@@ -1120,6 +1260,7 @@ prof_reset(tsd_t *tsd, size_t lg_sample) {
 	malloc_mutex_lock(tsd_tsdn(tsd), &tdatas_mtx);
 
 	lg_prof_sample = lg_sample;
+	prof_unbias_map_init();
 
 	next = NULL;
 	do {
@@ -1162,9 +1303,24 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) {
 
 	assert(tctx->cnts.curobjs == 0);
 	assert(tctx->cnts.curbytes == 0);
+	/*
+	 * These asserts are not correct -- see the comment about races in
+	 * prof.c:
+	 *
+	 * assert(tctx->cnts.curobjs_shifted_unbiased == 0);
+	 * assert(tctx->cnts.curbytes_unbiased == 0);
+	 */
 	assert(!opt_prof_accum);
 	assert(tctx->cnts.accumobjs == 0);
 	assert(tctx->cnts.accumbytes == 0);
+	/*
+	 * These asserts, though, are correct, since the accum counts never
+	 * go down.  Either prof_accum is off (in which case these should
+	 * never have changed from their initial value of zero), or it's on
+	 * (in which case we shouldn't be destroying this tctx).
+	 */
+	assert(tctx->cnts.accumobjs_shifted_unbiased == 0);
+	assert(tctx->cnts.accumbytes_unbiased == 0);
 
 	prof_gctx_t *gctx = tctx->gctx;
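
A couple of standalone sketches (editor's notes, not part of the patch) that sanity-check the math above. First, the per-size-class table that prof_unbias_map_init() builds: the program below evaluates the same formulas outside of jemalloc. The size classes, the lg_prof_sample of 19, and the tiny-class shift of 3 are illustrative assumptions, not values read from any particular build. It also shows why the shift is safe against overflow: cnt_shift <= sz for every size class, so cnt_shift / div_val <= sz / div_val = unbiased_sz, and the shifted count sum is bounded by the unbiased byte sum that has to fit in a size_t anyway.

/* unbias_table.c: illustrative only; build with: cc unbias_table.c -lm */
#include <math.h>
#include <stdio.h>

int
main(void) {
	double rate = (double)(1 << 19);     /* assumed lg_prof_sample = 19 */
	double cnt_shift = (double)(1 << 3); /* assumed SC_LG_TINY_MIN = 3 */
	double sizes[] = {8.0, 4096.0, 524288.0, 2097152.0};

	for (int i = 0; i < 4; i++) {
		double sz = sizes[i];
		/* Probability that an allocation of this size is sampled. */
		double div_val = 1.0 - exp(-sz / rate);
		/* Expected number of allocations represented by one sample. */
		double cnt = 1.0 / div_val;
		/*
		 * Rounding cnt directly can be off by ~33% (e.g. a true count
		 * of 1.5 rounds to 2); scaling by cnt_shift before rounding
		 * divides the worst-case roundoff error by that factor.
		 */
		printf("sz=%9.0f cnt=%10.3f rounded=%6.0f shifted=%8.0f "
		    "unbiased_sz=%10.0f\n", sz, cnt, round(cnt),
		    round(cnt_shift / div_val), round(sz / div_val));
	}
	return 0;
}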
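Second, the inversion in prof_do_unbias(). The sketch below round-trips it against the scaling that, per the comment, jeprof applies; jeprof_unbias() here is a reconstruction from the patch's c_out/s_out formulas, not jeprof's actual code. Feeding the computed (c_in, s_in) back through that scaling recovers the internally tracked values, which is the whole point of the exercise.

/* unbias_roundtrip.c: illustrative only; build with: cc unbias_roundtrip.c -lm */
#include <math.h>
#include <stdio.h>

static const double R = 1 << 19; /* assumed sampling rate, 1 << lg_prof_sample */

/* What jeprof does: scale both values by 1/(1 - exp(-avg_size/R)). */
static void
jeprof_unbias(double c_in, double s_in, double *c_out, double *s_out) {
	double scale = 1.0 / (1.0 - exp(-(s_in / c_in) / R));
	*c_out = c_in * scale;
	*s_out = s_in * scale;
}

/* The patch's inversion: x = s_out/c_out, y = s_out*(1-exp(-x/R)). */
static void
do_unbias(double c_out, double s_out, double *c_in, double *s_in) {
	double x = s_out / c_out;
	double y = s_out * (1.0 - exp(-x / R));
	*c_in = y / x;
	*s_in = y;
}

int
main(void) {
	/* Made-up internally tracked unbiased count and byte totals. */
	double c_out = 1.582, s_out = 829419.0;
	double c_in, s_in, c_rt, s_rt;

	do_unbias(c_out, s_out, &c_in, &s_in);
	jeprof_unbias(c_in, s_in, &c_rt, &s_rt);
	/* The round trip recovers the values we want jeprof to report. */
	printf("fed to jeprof: c=%f s=%f\n", c_in, s_in);
	printf("jeprof output: c=%f s=%f (want %f %f)\n",
	    c_rt, s_rt, c_out, s_out);
	return 0;
}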
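Finally, a usage note: since the new option goes through the standard conf plumbing, it should be settable like any other jemalloc option in a profiling-enabled build, e.g. via MALLOC_CONF=prof:true,prof_unbias:false for anyone who wants the old (biased, but cross-version-stable) dump values.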