diff --git a/include/jemalloc/internal/prof_externs.h b/include/jemalloc/internal/prof_externs.h
index 4579ab02..ba5933af 100644
--- a/include/jemalloc/internal/prof_externs.h
+++ b/include/jemalloc/internal/prof_externs.h
@@ -19,6 +19,7 @@ extern char opt_prof_prefix[
     PATH_MAX +
 #endif
     1];
+extern bool opt_prof_unbias;
 
 /* For recording recent allocations */
 extern ssize_t opt_prof_recent_alloc_max;
@@ -40,6 +41,9 @@ extern uint64_t prof_interval;
  * resets.
  */
 extern size_t lg_prof_sample;
+extern size_t prof_unbiased_sz[SC_NSIZES];
+extern size_t prof_shifted_unbiased_cnt[SC_NSIZES];
+void prof_unbias_map_init(void);
 
 extern bool prof_booted;
 
diff --git a/include/jemalloc/internal/prof_structs.h b/include/jemalloc/internal/prof_structs.h
index 26942aa6..fbad6145 100644
--- a/include/jemalloc/internal/prof_structs.h
+++ b/include/jemalloc/internal/prof_structs.h
@@ -24,9 +24,13 @@ typedef struct {
 struct prof_cnt_s {
 	/* Profiling counters. */
 	uint64_t curobjs;
+	uint64_t curobjs_shifted_unbiased;
 	uint64_t curbytes;
+	uint64_t curbytes_unbiased;
 	uint64_t accumobjs;
+	uint64_t accumobjs_shifted_unbiased;
 	uint64_t accumbytes;
+	uint64_t accumbytes_unbiased;
 };
 
 typedef enum {
diff --git a/src/jemalloc.c b/src/jemalloc.c
index f2e5f8eb..ae9ef3d1 100644
--- a/src/jemalloc.c
+++ b/src/jemalloc.c
@@ -1517,6 +1517,16 @@ malloc_conf_init_helper(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
 				}
 				CONF_CONTINUE;
 			}
+			/*
+			 * Undocumented.  When set to false, don't
+			 * correct for an unbiasing bug in jeprof
+			 * attribution.  This can be handy if you want
+			 * to get consistent numbers from your binary
+			 * across different jemalloc versions, even if
+			 * those numbers are incorrect.  The default is
+			 * true.
+			 */
+			CONF_HANDLE_BOOL(opt_prof_unbias, "prof_unbias")
 		}
 		if (config_log) {
 			if (CONF_MATCH("log")) {
diff --git a/src/prof.c b/src/prof.c
index 25735410..7b649e49 100644
--- a/src/prof.c
+++ b/src/prof.c
@@ -32,6 +32,7 @@ bool opt_prof_leak = false;
 bool opt_prof_accum = false;
 char opt_prof_prefix[PROF_DUMP_FILENAME_LEN];
 bool opt_prof_sys_thread_name = false;
+bool opt_prof_unbias = true;
 
 /* Accessed via prof_sample_event_handler(). */
 static counter_accum_t prof_idump_accumulated;
@@ -60,6 +61,8 @@ static malloc_mutex_t prof_gdump_mtx;
 uint64_t prof_interval = 0;
 
 size_t lg_prof_sample;
+size_t prof_unbiased_sz[SC_NSIZES];
+size_t prof_shifted_unbiased_cnt[SC_NSIZES];
 
 static uint64_t next_thr_uid;
 static malloc_mutex_t next_thr_uid_mtx;
@@ -69,6 +72,40 @@ bool prof_booted = false;
 
 /******************************************************************************/
 
+void prof_unbias_map_init(void) {
+	/* See the comment in prof_sample_new_event_wait(). */
+#ifdef JEMALLOC_PROF
+	for (szind_t i = 0; i < SC_NSIZES; i++) {
+		double sz = (double)sz_index2size(i);
+		double rate = (double)(ZU(1) << lg_prof_sample);
+		double div_val = 1.0 - exp(-sz / rate);
+		double unbiased_sz = sz / div_val;
+		/*
+		 * The "true" unbiased value for the count is
+		 * 1.0/(1 - exp(-sz/rate)).  The problem is, we keep the counts
+		 * as integers (for a variety of reasons -- rounding errors
+		 * could trigger asserts, and not all libcs can properly handle
+		 * floating point arithmetic during malloc calls inside libc).
+		 * Rounding to an integer, though, can lead to rounding errors
+		 * of over 30% for sizes close to the sampling rate.  So
+		 * instead, we multiply by a constant, dividing the maximum
+		 * possible roundoff error by that constant.
+		 * To avoid overflow in summing up size_t values, the largest
+		 * safe constant we can pick is the size of the smallest
+		 * allocation.
+		 */
+		double cnt_shift = (double)(ZU(1) << SC_LG_TINY_MIN);
+		double shifted_unbiased_cnt = cnt_shift / div_val;
+		prof_unbiased_sz[i] = (size_t)round(unbiased_sz);
+		prof_shifted_unbiased_cnt[i] = (size_t)round(
+		    shifted_unbiased_cnt);
+	}
+#else
+	unreachable();
+#endif
+}
+
+/******************************************************************************/
+
 void
 prof_alloc_rollback(tsd_t *tsd, prof_tctx_t *tctx) {
 	cassert(config_prof);
@@ -96,12 +133,30 @@ prof_malloc_sample_object(tsd_t *tsd, const void *ptr, size_t size,
 	    ptr);
 	prof_info_set(tsd, edata, tctx);
 
+	szind_t szind = sz_size2index(size);
+
 	malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock);
+	/*
+	 * We need to do these map lookups while holding the lock, to avoid
+	 * the possibility of races with prof_reset calls, which update the
+	 * map and then acquire the lock.  This actually still leaves a data
+	 * race on the contents of the unbias map, but we have not yet gone
+	 * through and atomic-ified the prof module, and compilers are not
+	 * yet causing us issues.  The key thing is that, if we do read
+	 * garbage data, the prof_reset call is about to mark our tctx as
+	 * expired before any dumping of our corrupted output is attempted.
+	 */
+	size_t shifted_unbiased_cnt = prof_shifted_unbiased_cnt[szind];
+	size_t unbiased_bytes = prof_unbiased_sz[szind];
 	tctx->cnts.curobjs++;
+	tctx->cnts.curobjs_shifted_unbiased += shifted_unbiased_cnt;
 	tctx->cnts.curbytes += usize;
+	tctx->cnts.curbytes_unbiased += unbiased_bytes;
 	if (opt_prof_accum) {
 		tctx->cnts.accumobjs++;
+		tctx->cnts.accumobjs_shifted_unbiased += shifted_unbiased_cnt;
 		tctx->cnts.accumbytes += usize;
+		tctx->cnts.accumbytes_unbiased += unbiased_bytes;
 	}
 	bool record_recent = prof_recent_alloc_prepare(tsd, tctx);
 	tctx->prepared = false;
@@ -118,12 +173,21 @@ prof_free_sampled_object(tsd_t *tsd, size_t usize, prof_info_t *prof_info) {
 	prof_tctx_t *tctx = prof_info->alloc_tctx;
 	assert((uintptr_t)tctx > (uintptr_t)1U);
 
+	szind_t szind = sz_size2index(usize);
 	malloc_mutex_lock(tsd_tsdn(tsd), tctx->tdata->lock);
 
 	assert(tctx->cnts.curobjs > 0);
 	assert(tctx->cnts.curbytes >= usize);
+	/*
+	 * It's not correct to do equivalent asserts for the unbiased bytes,
+	 * because of the potential for races with prof.reset calls.  The map
+	 * contents should really be atomic, but we have not atomic-ified the
+	 * prof module yet.
+	 */
 	tctx->cnts.curobjs--;
+	tctx->cnts.curobjs_shifted_unbiased -= prof_shifted_unbiased_cnt[szind];
 	tctx->cnts.curbytes -= usize;
+	tctx->cnts.curbytes_unbiased -= prof_unbiased_sz[szind];
 
 	prof_try_log(tsd, usize, prof_info);
@@ -517,6 +581,7 @@ prof_boot2(tsd_t *tsd, base_t *base) {
 		unsigned i;
 
 		lg_prof_sample = opt_lg_prof_sample;
+		prof_unbias_map_init();
 
 		prof_active = opt_prof_active;
 		if (malloc_mutex_init(&prof_active_mtx, "prof_active",
diff --git a/src/prof_data.c b/src/prof_data.c
index 6b441de1..ae9cd4b1 100644
--- a/src/prof_data.c
+++ b/src/prof_data.c
@@ -514,12 +514,121 @@ prof_dump_printf(write_cb_t *prof_dump_write, void *cbopaque,
 	prof_dump_write(cbopaque, buf);
 }
 
+/*
+ * Casting a double to a uint64_t is UB if the value is out of range for the
+ * target type.  I don't think this is practically possible with the cur
+ * counters, but it plausibly could be with the accum counters.
+ */
+#ifdef JEMALLOC_PROF
+static uint64_t
+prof_double_uint64_cast(double d) {
+	/*
+	 * Note: UINT64_MAX + 1 is exactly representable as a double on all
+	 * reasonable platforms (certainly those we'll support).  Writing this
+	 * as !(a < b) instead of (a >= b) means that we're NaN-safe.
+	 */
+	double rounded = round(d);
+	if (!(rounded < (double)UINT64_MAX)) {
+		return UINT64_MAX;
+	}
+	return (uint64_t)rounded;
+}
+#endif
+
+/*
+ * The unbiasing story is long.  The jeprof unbiasing logic was copied from
+ * pprof.  Both shared an issue: they unbiased using the average size of the
+ * allocations at a particular stack trace.  This can work out OK if
+ * allocations are mostly of the same size given some stack, but not
+ * otherwise.  We now internally track what the unbiased results ought to be.
+ * We can't just report them as they are though; they'll still go through the
+ * jeprof unbiasing process.  Instead, we figure out what values we can feed
+ * *into* jeprof's unbiasing mechanism that will lead to it producing the
+ * right values as output.
+ *
+ * It'll unbias count and aggregate size as:
+ *
+ *   c_out = c_in * 1/(1-exp(-s_in/c_in/R))
+ *   s_out = s_in * 1/(1-exp(-s_in/c_in/R))
+ *
+ * We want to solve for the values of c_in and s_in that will give the c_out
+ * and s_out that we've computed internally.
+ *
+ * Let's do a change of variables (both to make the math easier and to make
+ * it easier to write):
+ *   x = s_in / c_in
+ *   y = s_in
+ *   k = 1/R.
+ *
+ * Then
+ *   c_out = y/x * 1/(1-exp(-k*x))
+ *   s_out = y * 1/(1-exp(-k*x))
+ *
+ * The first equation gives:
+ *   y = x * c_out * (1-exp(-k*x))
+ * The second gives:
+ *   y = s_out * (1-exp(-k*x))
+ * Equating the two, we get
+ *   x = s_out / c_out.
+ * And all the other values fall out from that.
+ *
+ * This is all a fair bit of work.  The thing we get out of it is that we
+ * don't break backwards compatibility with jeprof (and the various tools
+ * that have copied its unbiasing logic).  Eventually, we anticipate a v3
+ * heap profile dump format based on JSON, at which point I think much of
+ * this logic can get cleaned up (since we'll be taking a compatibility
+ * break there anyways).
+ */
+static void
+prof_do_unbias(uint64_t c_out_shifted_i, uint64_t s_out_i, uint64_t *r_c_in,
+    uint64_t *r_s_in) {
+#ifdef JEMALLOC_PROF
+	if (c_out_shifted_i == 0 || s_out_i == 0) {
+		*r_c_in = 0;
+		*r_s_in = 0;
+		return;
+	}
+	/*
+	 * See the note in prof_unbias_map_init() for why we take c_out in a
+	 * shifted form.
+	 */
+	double c_out = (double)c_out_shifted_i
+	    / (double)(ZU(1) << SC_LG_TINY_MIN);
+	double s_out = (double)s_out_i;
+	double R = (double)(ZU(1) << lg_prof_sample);
+
+	double x = s_out / c_out;
+	double y = s_out * (1.0 - exp(-x / R));
+
+	double c_in = y / x;
+	double s_in = y;
+
+	*r_c_in = prof_double_uint64_cast(c_in);
+	*r_s_in = prof_double_uint64_cast(s_in);
+#else
+	unreachable();
+#endif
+}
+
 static void
 prof_dump_print_cnts(write_cb_t *prof_dump_write, void *cbopaque,
     const prof_cnt_t *cnts) {
+	uint64_t curobjs;
+	uint64_t curbytes;
+	uint64_t accumobjs;
+	uint64_t accumbytes;
+	if (opt_prof_unbias) {
+		prof_do_unbias(cnts->curobjs_shifted_unbiased,
+		    cnts->curbytes_unbiased, &curobjs, &curbytes);
+		prof_do_unbias(cnts->accumobjs_shifted_unbiased,
+		    cnts->accumbytes_unbiased, &accumobjs, &accumbytes);
+	} else {
+		curobjs = cnts->curobjs;
+		curbytes = cnts->curbytes;
+		accumobjs = cnts->accumobjs;
+		accumbytes = cnts->accumbytes;
+	}
 	prof_dump_printf(prof_dump_write, cbopaque,
 	    "%"FMTu64": %"FMTu64" [%"FMTu64": %"FMTu64"]",
-	    cnts->curobjs, cnts->curbytes, cnts->accumobjs, cnts->accumbytes);
+	    curobjs, curbytes, accumobjs, accumbytes);
 }
 
 static void
@@ -539,12 +648,20 @@ prof_tctx_merge_tdata(tsdn_t *tsdn, prof_tctx_t *tctx, prof_tdata_t *tdata) {
 		memcpy(&tctx->dump_cnts, &tctx->cnts, sizeof(prof_cnt_t));
 		tdata->cnt_summed.curobjs += tctx->dump_cnts.curobjs;
+		tdata->cnt_summed.curobjs_shifted_unbiased
+		    += tctx->dump_cnts.curobjs_shifted_unbiased;
 		tdata->cnt_summed.curbytes += tctx->dump_cnts.curbytes;
+		tdata->cnt_summed.curbytes_unbiased
+		    += tctx->dump_cnts.curbytes_unbiased;
 		if (opt_prof_accum) {
 			tdata->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs;
+			tdata->cnt_summed.accumobjs_shifted_unbiased +=
+			    tctx->dump_cnts.accumobjs_shifted_unbiased;
 			tdata->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes;
+			tdata->cnt_summed.accumbytes_unbiased +=
+			    tctx->dump_cnts.accumbytes_unbiased;
 		}
 		break;
 	case prof_tctx_state_dumping:
@@ -558,10 +675,17 @@ prof_tctx_merge_gctx(tsdn_t *tsdn, prof_tctx_t *tctx, prof_gctx_t *gctx) {
 	malloc_mutex_assert_owner(tsdn, gctx->lock);
 
 	gctx->cnt_summed.curobjs += tctx->dump_cnts.curobjs;
+	gctx->cnt_summed.curobjs_shifted_unbiased
+	    += tctx->dump_cnts.curobjs_shifted_unbiased;
 	gctx->cnt_summed.curbytes += tctx->dump_cnts.curbytes;
+	gctx->cnt_summed.curbytes_unbiased += tctx->dump_cnts.curbytes_unbiased;
 	if (opt_prof_accum) {
 		gctx->cnt_summed.accumobjs += tctx->dump_cnts.accumobjs;
+		gctx->cnt_summed.accumobjs_shifted_unbiased
+		    += tctx->dump_cnts.accumobjs_shifted_unbiased;
 		gctx->cnt_summed.accumbytes += tctx->dump_cnts.accumbytes;
+		gctx->cnt_summed.accumbytes_unbiased
+		    += tctx->dump_cnts.accumbytes_unbiased;
 	}
 }
 
@@ -757,11 +881,19 @@ prof_tdata_merge_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata,
 		}
 
 		arg->cnt_all->curobjs += tdata->cnt_summed.curobjs;
+		arg->cnt_all->curobjs_shifted_unbiased
+		    += tdata->cnt_summed.curobjs_shifted_unbiased;
 		arg->cnt_all->curbytes += tdata->cnt_summed.curbytes;
+		arg->cnt_all->curbytes_unbiased
+		    += tdata->cnt_summed.curbytes_unbiased;
 		if (opt_prof_accum) {
 			arg->cnt_all->accumobjs += tdata->cnt_summed.accumobjs;
+			arg->cnt_all->accumobjs_shifted_unbiased
+			    += tdata->cnt_summed.accumobjs_shifted_unbiased;
 			arg->cnt_all->accumbytes +=
 			    tdata->cnt_summed.accumbytes;
+			arg->cnt_all->accumbytes_unbiased +=
+			    tdata->cnt_summed.accumbytes_unbiased;
 		}
 	} else {
 		tdata->dumping = false;
@@ -814,8 +946,16 @@ prof_dump_gctx(prof_dump_iter_arg_t *arg, prof_gctx_t *gctx,
 	    (opt_prof_accum && gctx->cnt_summed.accumobjs ==
 	    0)) {
 		assert(gctx->cnt_summed.curobjs == 0);
 		assert(gctx->cnt_summed.curbytes == 0);
+		/*
+		 * These asserts would not be correct -- see the comment on
+		 * races in prof.c:
+		 * assert(gctx->cnt_summed.curobjs_shifted_unbiased == 0);
+		 * assert(gctx->cnt_summed.curbytes_unbiased == 0);
+		 */
 		assert(gctx->cnt_summed.accumobjs == 0);
+		assert(gctx->cnt_summed.accumobjs_shifted_unbiased == 0);
 		assert(gctx->cnt_summed.accumbytes == 0);
+		assert(gctx->cnt_summed.accumbytes_unbiased == 0);
 		return;
 	}
 
@@ -834,7 +974,7 @@ prof_dump_gctx(prof_dump_iter_arg_t *arg, prof_gctx_t *gctx,
 	}
 
 /*
- * See prof_sample_threshold_update() comment for why the body of this function
+ * See prof_sample_new_event_wait() comment for why the body of this function
  * is conditionally compiled.
  */
 static void
@@ -1120,6 +1260,7 @@ prof_reset(tsd_t *tsd, size_t lg_sample) {
 	malloc_mutex_lock(tsd_tsdn(tsd), &tdatas_mtx);
 
 	lg_prof_sample = lg_sample;
+	prof_unbias_map_init();
 
 	next = NULL;
 	do {
@@ -1162,9 +1303,24 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) {
 
 	assert(tctx->cnts.curobjs == 0);
 	assert(tctx->cnts.curbytes == 0);
+	/*
+	 * These asserts are not correct -- see the comment about races in
+	 * prof.c:
+	 *
+	 * assert(tctx->cnts.curobjs_shifted_unbiased == 0);
+	 * assert(tctx->cnts.curbytes_unbiased == 0);
+	 */
 	assert(!opt_prof_accum);
 	assert(tctx->cnts.accumobjs == 0);
 	assert(tctx->cnts.accumbytes == 0);
+	/*
+	 * These asserts, though, are correct, since the accum counts never
+	 * go down.  Either prof_accum is off (in which case these should
+	 * never have changed from their initial value of zero), or it's on
+	 * (in which case we shouldn't be destroying this tctx).
+	 */
+	assert(tctx->cnts.accumobjs_shifted_unbiased == 0);
+	assert(tctx->cnts.accumbytes_unbiased == 0);
 
 	prof_gctx_t *gctx = tctx->gctx;
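
A couple of standalone sketches (editor's notes, not part of the patch) that sanity-check the math above. First, the per-size-class table that prof_unbias_map_init() builds: the program below evaluates the same formulas outside of jemalloc. The size classes, the lg_prof_sample of 19, and the tiny-class shift of 3 are illustrative assumptions, not values read from any particular build. It also shows why the shift is safe against overflow: cnt_shift <= sz for every size class, so cnt_shift / div_val <= sz / div_val = unbiased_sz, and the shifted count sum is bounded by the unbiased byte sum that has to fit in a size_t anyway.

/* unbias_table.c: illustrative only; build with: cc unbias_table.c -lm */
#include <math.h>
#include <stdio.h>

int
main(void) {
	double rate = (double)(1 << 19);     /* assumed lg_prof_sample = 19 */
	double cnt_shift = (double)(1 << 3); /* assumed SC_LG_TINY_MIN = 3 */
	double sizes[] = {8.0, 4096.0, 524288.0, 2097152.0};

	for (int i = 0; i < 4; i++) {
		double sz = sizes[i];
		/* Probability that an allocation of this size is sampled. */
		double div_val = 1.0 - exp(-sz / rate);
		/* Expected number of allocations represented by one sample. */
		double cnt = 1.0 / div_val;
		/*
		 * Rounding cnt directly can be off by ~33% (e.g. a true count
		 * of 1.5 rounds to 2); scaling by cnt_shift before rounding
		 * divides the worst-case roundoff error by that factor.
		 */
		printf("sz=%9.0f cnt=%10.3f rounded=%6.0f shifted=%8.0f "
		    "unbiased_sz=%10.0f\n", sz, cnt, round(cnt),
		    round(cnt_shift / div_val), round(sz / div_val));
	}
	return 0;
}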
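Second, the inversion in prof_do_unbias(). The sketch below round-trips it against the scaling that, per the comment, jeprof applies; jeprof_unbias() here is a reconstruction from the patch's c_out/s_out formulas, not jeprof's actual code. Feeding the computed (c_in, s_in) back through that scaling recovers the internally tracked values, which is the whole point of the exercise.

/* unbias_roundtrip.c: illustrative only; build with: cc unbias_roundtrip.c -lm */
#include <math.h>
#include <stdio.h>

static const double R = 1 << 19; /* assumed sampling rate, 1 << lg_prof_sample */

/* What jeprof does: scale both values by 1/(1 - exp(-avg_size/R)). */
static void
jeprof_unbias(double c_in, double s_in, double *c_out, double *s_out) {
	double scale = 1.0 / (1.0 - exp(-(s_in / c_in) / R));
	*c_out = c_in * scale;
	*s_out = s_in * scale;
}

/* The patch's inversion: x = s_out/c_out, y = s_out*(1-exp(-x/R)). */
static void
do_unbias(double c_out, double s_out, double *c_in, double *s_in) {
	double x = s_out / c_out;
	double y = s_out * (1.0 - exp(-x / R));
	*c_in = y / x;
	*s_in = y;
}

int
main(void) {
	/* Made-up internally tracked unbiased count and byte totals. */
	double c_out = 1.582, s_out = 829419.0;
	double c_in, s_in, c_rt, s_rt;

	do_unbias(c_out, s_out, &c_in, &s_in);
	jeprof_unbias(c_in, s_in, &c_rt, &s_rt);
	/* The round trip recovers the values we want jeprof to report. */
	printf("fed to jeprof: c=%f s=%f\n", c_in, s_in);
	printf("jeprof output: c=%f s=%f (want %f %f)\n",
	    c_rt, s_rt, c_out, s_out);
	return 0;
}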
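Finally, a usage note: since the new option goes through the standard conf plumbing, it should be settable like any other jemalloc option in a profiling-enabled build, e.g. via MALLOC_CONF=prof:true,prof_unbias:false for anyone who wants the old (biased, but cross-version-stable) dump values.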