diff --git a/.appveyor.yml b/.appveyor.yml new file mode 100644 index 00000000..ddd5c571 --- /dev/null +++ b/.appveyor.yml @@ -0,0 +1,28 @@ +version: '{build}' + +environment: + matrix: + - MSYSTEM: MINGW64 + CPU: x86_64 + MSVC: amd64 + - MSYSTEM: MINGW32 + CPU: i686 + MSVC: x86 + - MSYSTEM: MINGW64 + CPU: x86_64 + - MSYSTEM: MINGW32 + CPU: i686 + +install: + - set PATH=c:\msys64\%MSYSTEM%\bin;c:\msys64\usr\bin;%PATH% + - if defined MSVC call "c:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" %MSVC% + - if defined MSVC pacman --noconfirm -Rsc mingw-w64-%CPU%-gcc gcc + - pacman --noconfirm -Suy mingw-w64-%CPU%-make + +build_script: + - bash -c "autoconf" + - bash -c "./configure" + - mingw32-make -j3 + - file lib/jemalloc.dll + - mingw32-make -j3 tests + - mingw32-make -k check diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 00000000..1fed4f8e --- /dev/null +++ b/.travis.yml @@ -0,0 +1,29 @@ +language: c + +matrix: + include: + - os: linux + compiler: gcc + - os: linux + compiler: gcc + env: + - EXTRA_FLAGS=-m32 + addons: + apt: + packages: + - gcc-multilib + - os: osx + compiler: clang + - os: osx + compiler: clang + env: + - EXTRA_FLAGS=-m32 + +before_script: + - autoconf + - ./configure${EXTRA_FLAGS:+ CC="$CC $EXTRA_FLAGS"} + - make -j3 + - make -j3 tests + +script: + - make check diff --git a/ChangeLog b/ChangeLog index ed62e0e7..ac2e4d3c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -4,6 +4,49 @@ brevity. Much more detail can be found in the git revision history: https://github.com/jemalloc/jemalloc +* 4.3.0 (November 4, 2016) + + This is the first release that passes the test suite for multiple Windows + configurations, thanks in large part to @glandium setting up continuous + integration via AppVeyor (and Travis CI for Linux and OS X). + + New features: + - Add "J" (JSON) support to malloc_stats_print(). (@jasone) + - Add Cray compiler support. (@ronawho) + + Optimizations: + - Add/use adaptive spinning for bootstrapping and radix tree node + initialization. (@jasone) + + Bug fixes: + - Fix large allocation to search starting in the optimal size class heap, + which can substantially reduce virtual memory churn and fragmentation. This + regression was first released in 4.0.0. (@mjp41, @jasone) + - Fix stats.arenas.<i>.nthreads accounting. (@interwq) + - Fix and simplify decay-based purging. (@jasone) + - Make DSS (sbrk(2)-related) operations lockless, which resolves potential + deadlocks during thread exit. (@jasone) + - Fix over-sized allocation of radix tree leaf nodes. (@mjp41, @ogaun, + @jasone) + - Fix EXTRA_CFLAGS to not affect configuration. (@jasone) + - Fix a Valgrind integration bug. (@ronawho) + - Disallow 0x5a junk filling when running in Valgrind. (@jasone) + - Fix a file descriptor leak on Linux. This regression was first released in + 4.2.0. (@vsarunas, @jasone) + - Fix static linking of jemalloc with glibc. (@djwatson) + - Use syscall(2) rather than {open,read,close}(2) during boot on Linux. This + works around other libraries' system call wrappers performing reentrant + allocation. (@kspinka, @Whissi, @jasone) + - Fix OS X default zone replacement to work with OS X 10.12. (@glandium, + @jasone) + - Fix cached memory management to avoid needless commit/decommit operations + during purging, which resolves permanent virtual memory map fragmentation + issues on Windows. (@mjp41, @jasone) + - Fix TSD fetches to avoid (recursive) allocation. This is relevant to + non-TLS and Windows configurations.
(@jasone) + - Fix malloc_conf overriding to work on Windows. (@jasone) + - Forcibly disable lazy-lock on Windows (was forcibly *enabled*). (@jasone) + * 4.2.1 (June 8, 2016) Bug fixes: @@ -19,7 +62,7 @@ brevity. Much more detail can be found in the git revision history: New features: - Add the arena.<i>.reset mallctl, which makes it possible to discard all of - an arena's allocations in a single operation. (@jasone@) + an arena's allocations in a single operation. (@jasone) - Add the stats.retained and stats.arenas.<i>.retained statistics. (@jasone) - Add the --with-version configure option. (@jasone) - Support --with-lg-page values larger than actual page size. (@jasone) diff --git a/Makefile.in b/Makefile.in index 652f01f2..d13c7f10 100644 --- a/Makefile.in +++ b/Makefile.in @@ -24,7 +24,8 @@ abs_objroot := @abs_objroot@ # Build parameters. CPPFLAGS := @CPPFLAGS@ -I$(srcroot)include -I$(objroot)include -CFLAGS := @CFLAGS@ +EXTRA_CFLAGS := @EXTRA_CFLAGS@ +CFLAGS := @CFLAGS@ $(EXTRA_CFLAGS) LDFLAGS := @LDFLAGS@ EXTRA_LDFLAGS := @EXTRA_LDFLAGS@ LIBS := @LIBS@ @@ -52,15 +53,19 @@ enable_prof := @enable_prof@ enable_valgrind := @enable_valgrind@ enable_zone_allocator := @enable_zone_allocator@ MALLOC_CONF := @JEMALLOC_CPREFIX@MALLOC_CONF +link_whole_archive := @link_whole_archive@ DSO_LDFLAGS = @DSO_LDFLAGS@ SOREV = @SOREV@ PIC_CFLAGS = @PIC_CFLAGS@ CTARGET = @CTARGET@ LDTARGET = @LDTARGET@ +TEST_LD_MODE = @TEST_LD_MODE@ MKLIB = @MKLIB@ AR = @AR@ ARFLAGS = @ARFLAGS@ CC_MM = @CC_MM@ +LM := @LM@ +INSTALL = @INSTALL@ ifeq (macho, $(ABI)) TEST_LIBRARY_PATH := DYLD_FALLBACK_LIBRARY_PATH="$(objroot)lib" @@ -99,6 +104,7 @@ C_SRCS := $(srcroot)src/jemalloc.c \ $(srcroot)src/quarantine.c \ $(srcroot)src/rtree.c \ $(srcroot)src/stats.c \ + $(srcroot)src/spin.c \ $(srcroot)src/tcache.c \ $(srcroot)src/ticker.c \ $(srcroot)src/tsd.c \ @@ -122,6 +128,11 @@ DSOS := $(objroot)lib/$(LIBJEMALLOC).$(SOREV) ifneq ($(SOREV),$(SO)) DSOS += $(objroot)lib/$(LIBJEMALLOC).$(SO) endif +ifeq (1, $(link_whole_archive)) +LJEMALLOC := -Wl,--whole-archive -L$(objroot)lib -l$(LIBJEMALLOC) -Wl,--no-whole-archive +else +LJEMALLOC := $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) +endif PC := $(objroot)jemalloc.pc MAN3 := $(objroot)doc/jemalloc$(install_suffix).3 DOCS_XML := $(objroot)doc/jemalloc$(install_suffix).xml @@ -133,7 +144,11 @@ C_TESTLIB_SRCS := $(srcroot)test/src/btalloc.c $(srcroot)test/src/btalloc_0.c \ $(srcroot)test/src/mtx.c $(srcroot)test/src/mq.c \ $(srcroot)test/src/SFMT.c $(srcroot)test/src/test.c \ $(srcroot)test/src/thd.c $(srcroot)test/src/timer.c +ifeq (1, $(link_whole_archive)) +C_UTIL_INTEGRATION_SRCS := +else C_UTIL_INTEGRATION_SRCS := $(srcroot)src/nstime.c $(srcroot)src/util.c +endif TESTS_UNIT := \ $(srcroot)test/unit/a0.c \ $(srcroot)test/unit/arena_reset.c \ @@ -295,69 +310,69 @@ $(STATIC_LIBS): $(objroot)test/unit/%$(EXE): $(objroot)test/unit/%.$(O) $(TESTS_UNIT_LINK_OBJS) $(C_JET_OBJS) $(C_TESTLIB_UNIT_OBJS) @mkdir -p $(@D) - $(CC) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LDFLAGS) $(filter-out -lm,$(LIBS)) -lm $(EXTRA_LDFLAGS) + $(CC) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS) $(objroot)test/integration/%$(EXE): $(objroot)test/integration/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) @mkdir -p $(@D) - $(CC) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) $(LDFLAGS) $(filter-out -lm,$(filter
-lpthread,$(LIBS))) -lm $(EXTRA_LDFLAGS) + $(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LJEMALLOC) $(LDFLAGS) $(filter-out -lm,$(filter -lrt -lpthread,$(LIBS))) $(LM) $(EXTRA_LDFLAGS) $(objroot)test/stress/%$(EXE): $(objroot)test/stress/%.$(O) $(C_JET_OBJS) $(C_TESTLIB_STRESS_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) @mkdir -p $(@D) - $(CC) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) $(LDFLAGS) $(filter-out -lm,$(LIBS)) -lm $(EXTRA_LDFLAGS) + $(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS) build_lib_shared: $(DSOS) build_lib_static: $(STATIC_LIBS) build_lib: build_lib_shared build_lib_static install_bin: - install -d $(BINDIR) + $(INSTALL) -d $(BINDIR) @for b in $(BINS); do \ - echo "install -m 755 $$b $(BINDIR)"; \ - install -m 755 $$b $(BINDIR); \ + echo "$(INSTALL) -m 755 $$b $(BINDIR)"; \ + $(INSTALL) -m 755 $$b $(BINDIR); \ done install_include: - install -d $(INCLUDEDIR)/jemalloc + $(INSTALL) -d $(INCLUDEDIR)/jemalloc @for h in $(C_HDRS); do \ - echo "install -m 644 $$h $(INCLUDEDIR)/jemalloc"; \ - install -m 644 $$h $(INCLUDEDIR)/jemalloc; \ + echo "$(INSTALL) -m 644 $$h $(INCLUDEDIR)/jemalloc"; \ + $(INSTALL) -m 644 $$h $(INCLUDEDIR)/jemalloc; \ done install_lib_shared: $(DSOS) - install -d $(LIBDIR) - install -m 755 $(objroot)lib/$(LIBJEMALLOC).$(SOREV) $(LIBDIR) + $(INSTALL) -d $(LIBDIR) + $(INSTALL) -m 755 $(objroot)lib/$(LIBJEMALLOC).$(SOREV) $(LIBDIR) ifneq ($(SOREV),$(SO)) ln -sf $(LIBJEMALLOC).$(SOREV) $(LIBDIR)/$(LIBJEMALLOC).$(SO) endif install_lib_static: $(STATIC_LIBS) - install -d $(LIBDIR) + $(INSTALL) -d $(LIBDIR) @for l in $(STATIC_LIBS); do \ - echo "install -m 755 $$l $(LIBDIR)"; \ - install -m 755 $$l $(LIBDIR); \ + echo "$(INSTALL) -m 755 $$l $(LIBDIR)"; \ + $(INSTALL) -m 755 $$l $(LIBDIR); \ done install_lib_pc: $(PC) - install -d $(LIBDIR)/pkgconfig + $(INSTALL) -d $(LIBDIR)/pkgconfig @for l in $(PC); do \ - echo "install -m 644 $$l $(LIBDIR)/pkgconfig"; \ - install -m 644 $$l $(LIBDIR)/pkgconfig; \ + echo "$(INSTALL) -m 644 $$l $(LIBDIR)/pkgconfig"; \ + $(INSTALL) -m 644 $$l $(LIBDIR)/pkgconfig; \ done install_lib: install_lib_shared install_lib_static install_lib_pc install_doc_html: - install -d $(DATADIR)/doc/jemalloc$(install_suffix) + $(INSTALL) -d $(DATADIR)/doc/jemalloc$(install_suffix) @for d in $(DOCS_HTML); do \ - echo "install -m 644 $$d $(DATADIR)/doc/jemalloc$(install_suffix)"; \ - install -m 644 $$d $(DATADIR)/doc/jemalloc$(install_suffix); \ + echo "$(INSTALL) -m 644 $$d $(DATADIR)/doc/jemalloc$(install_suffix)"; \ + $(INSTALL) -m 644 $$d $(DATADIR)/doc/jemalloc$(install_suffix); \ done install_doc_man: - install -d $(MANDIR)/man3 + $(INSTALL) -d $(MANDIR)/man3 @for d in $(DOCS_MAN3); do \ - echo "install -m 644 $$d $(MANDIR)/man3"; \ - install -m 644 $$d $(MANDIR)/man3; \ + echo "$(INSTALL) -m 644 $$d $(MANDIR)/man3"; \ + $(INSTALL) -m 644 $$d $(MANDIR)/man3; \ done install_doc: install_doc_html install_doc_man diff --git a/README b/README index 9b268f42..5ff24a9e 100644 --- a/README +++ b/README @@ -17,4 +17,4 @@ jemalloc. The ChangeLog file contains a brief summary of changes for each release. 
-URL: http://www.canonware.com/jemalloc/ +URL: http://jemalloc.net/ diff --git a/configure.ac b/configure.ac index 7f19715d..104fd994 100644 --- a/configure.ac +++ b/configure.ac @@ -118,6 +118,7 @@ dnl If CFLAGS isn't defined, set CFLAGS to something reasonable. Otherwise, dnl just prevent autoconf from molesting CFLAGS. CFLAGS=$CFLAGS AC_PROG_CC + if test "x$GCC" != "xyes" ; then AC_CACHE_CHECK([whether compiler is MSVC], [je_cv_msvc], @@ -131,12 +132,54 @@ if test "x$GCC" != "xyes" ; then [je_cv_msvc=no])]) fi +dnl check if a cray prgenv wrapper compiler is being used +je_cv_cray_prgenv_wrapper="" +if test "x${PE_ENV}" != "x" ; then + case "${CC}" in + CC|cc) + je_cv_cray_prgenv_wrapper="yes" + ;; + *) + ;; + esac +fi + +AC_CACHE_CHECK([whether compiler is cray], + [je_cv_cray], + [AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], + [ +#ifndef _CRAYC + int fail[-1]; +#endif +])], + [je_cv_cray=yes], + [je_cv_cray=no])]) + +if test "x${je_cv_cray}" = "xyes" ; then + AC_CACHE_CHECK([whether cray compiler version is 8.4], + [je_cv_cray_84], + [AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], + [ +#if !(_RELEASE_MAJOR == 8 && _RELEASE_MINOR == 4) + int fail[-1]; +#endif +])], + [je_cv_cray_84=yes], + [je_cv_cray_84=no])]) +fi + if test "x$CFLAGS" = "x" ; then no_CFLAGS="yes" if test "x$GCC" = "xyes" ; then - JE_CFLAGS_APPEND([-std=gnu99]) - if test "x$je_cv_cflags_appended" = "x-std=gnu99" ; then +dnl JE_CFLAGS_APPEND([-std=gnu99]) + JE_CFLAGS_APPEND([-std=gnu11]) + if test "x$je_cv_cflags_appended" = "x-std=gnu11" ; then AC_DEFINE_UNQUOTED([JEMALLOC_HAS_RESTRICT]) + else + JE_CFLAGS_APPEND([-std=gnu99]) + if test "x$je_cv_cflags_appended" = "x-std=gnu99" ; then + AC_DEFINE_UNQUOTED([JEMALLOC_HAS_RESTRICT]) + fi fi JE_CFLAGS_APPEND([-Wall]) JE_CFLAGS_APPEND([-Werror=declaration-after-statement]) @@ -152,11 +195,21 @@ if test "x$CFLAGS" = "x" ; then JE_CFLAGS_APPEND([-FS]) CPPFLAGS="$CPPFLAGS -I${srcdir}/include/msvc_compat" fi + if test "x$je_cv_cray" = "xyes" ; then + dnl cray compiler 8.4 has an inlining bug + if test "x$je_cv_cray_84" = "xyes" ; then + JE_CFLAGS_APPEND([-hipa2]) + JE_CFLAGS_APPEND([-hnognu]) + fi + if test "x$enable_cc_silence" != "xno" ; then + dnl ignore unreachable code warning + JE_CFLAGS_APPEND([-hnomessage=128]) + dnl ignore redefinition of "malloc", "free", etc warning + JE_CFLAGS_APPEND([-hnomessage=1357]) + fi + fi fi -dnl Append EXTRA_CFLAGS to CFLAGS, if defined. -if test "x$EXTRA_CFLAGS" != "x" ; then - JE_CFLAGS_APPEND([$EXTRA_CFLAGS]) -fi +AC_SUBST([EXTRA_CFLAGS]) AC_PROG_CPP AC_C_BIGENDIAN([ac_cv_big_endian=1], [ac_cv_big_endian=0]) @@ -263,17 +316,27 @@ o="$ac_objext" a="a" exe="$ac_exeext" libprefix="lib" +link_whole_archive="0" DSO_LDFLAGS='-shared -Wl,-soname,$(@F)' RPATH='-Wl,-rpath,$(1)' SOREV="${so}.${rev}" PIC_CFLAGS='-fPIC -DPIC' CTARGET='-o $@' LDTARGET='-o $@' +TEST_LD_MODE= EXTRA_LDFLAGS= ARFLAGS='crus' AROUT=' $@' CC_MM=1 +if test "x$je_cv_cray_prgenv_wrapper" = "xyes" ; then + TEST_LD_MODE='-dynamic' +fi + +if test "x${je_cv_cray}" = "xyes" ; then + CC_MM= +fi + AN_MAKEVAR([AR], [AC_PROG_AR]) AN_PROGRAM([ar], [AC_PROG_AR]) AC_DEFUN([AC_PROG_AR], [AC_CHECK_TOOL(AR, ar, :)]) @@ -286,11 +349,11 @@ dnl dnl Define cpp macros in CPPFLAGS, rather than doing AC_DEFINE(macro), since the dnl definitions need to be seen before any headers are included, which is a pain dnl to make happen otherwise. 
+CFLAGS="$CFLAGS" default_munmap="1" maps_coalesce="1" case "${host}" in *-*-darwin* | *-*-ios*) - CFLAGS="$CFLAGS" abi="macho" AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) RPATH="" @@ -303,30 +366,26 @@ case "${host}" in sbrk_deprecated="1" ;; *-*-freebsd*) - CFLAGS="$CFLAGS" abi="elf" AC_DEFINE([JEMALLOC_SYSCTL_VM_OVERCOMMIT], [ ]) AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) force_lazy_lock="1" ;; *-*-dragonfly*) - CFLAGS="$CFLAGS" abi="elf" AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) ;; *-*-openbsd*) - CFLAGS="$CFLAGS" abi="elf" AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) force_tls="0" ;; *-*-bitrig*) - CFLAGS="$CFLAGS" abi="elf" AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) ;; *-*-linux*) - CFLAGS="$CFLAGS" + dnl syscall(2) and secure_getenv(3) are exposed by _GNU_SOURCE. CPPFLAGS="$CPPFLAGS -D_GNU_SOURCE" abi="elf" AC_DEFINE([JEMALLOC_HAS_ALLOCA_H]) @@ -345,13 +404,12 @@ case "${host}" in #error aout #endif ]])], - [CFLAGS="$CFLAGS"; abi="elf"], + [abi="elf"], [abi="aout"]) AC_MSG_RESULT([$abi]) AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) ;; *-*-solaris2*) - CFLAGS="$CFLAGS" abi="elf" AC_DEFINE([JEMALLOC_PURGE_MADVISE_FREE], [ ]) RPATH='-Wl,-R,$(1)' @@ -372,7 +430,6 @@ case "${host}" in *-*-mingw* | *-*-cygwin*) abi="pecoff" force_tls="0" - force_lazy_lock="1" maps_coalesce="0" RPATH="" so="dll" @@ -389,6 +446,7 @@ case "${host}" in else importlib="${so}" DSO_LDFLAGS="-shared" + link_whole_archive="1" fi a="lib" libprefix="" @@ -426,17 +484,28 @@ AC_SUBST([o]) AC_SUBST([a]) AC_SUBST([exe]) AC_SUBST([libprefix]) +AC_SUBST([link_whole_archive]) AC_SUBST([DSO_LDFLAGS]) AC_SUBST([EXTRA_LDFLAGS]) AC_SUBST([SOREV]) AC_SUBST([PIC_CFLAGS]) AC_SUBST([CTARGET]) AC_SUBST([LDTARGET]) +AC_SUBST([TEST_LD_MODE]) AC_SUBST([MKLIB]) AC_SUBST([ARFLAGS]) AC_SUBST([AROUT]) AC_SUBST([CC_MM]) +dnl Determine whether libm must be linked to use e.g. log(3). +AC_SEARCH_LIBS([log], [m], , [AC_MSG_ERROR([Missing math functions])]) +if test "x$ac_cv_search_log" != "xnone required" ; then + LM="$ac_cv_search_log" +else + LM= +fi +AC_SUBST(LM) + JE_COMPILABLE([__attribute__ syntax], [static __attribute__((unused)) void foo(void){}], [], @@ -450,6 +519,7 @@ fi dnl Check for tls_model attribute support (clang 3.0 still lacks support). SAVED_CFLAGS="${CFLAGS}" JE_CFLAGS_APPEND([-Werror]) +JE_CFLAGS_APPEND([-herror_on_warning]) JE_COMPILABLE([tls_model attribute], [], [static __thread int __attribute__((tls_model("initial-exec"), unused)) foo; @@ -465,6 +535,7 @@ fi dnl Check for alloc_size attribute support. SAVED_CFLAGS="${CFLAGS}" JE_CFLAGS_APPEND([-Werror]) +JE_CFLAGS_APPEND([-herror_on_warning]) JE_COMPILABLE([alloc_size attribute], [#include ], [void *foo(size_t size) __attribute__((alloc_size(1)));], [je_cv_alloc_size]) @@ -475,6 +546,7 @@ fi dnl Check for format(gnu_printf, ...) attribute support. SAVED_CFLAGS="${CFLAGS}" JE_CFLAGS_APPEND([-Werror]) +JE_CFLAGS_APPEND([-herror_on_warning]) JE_COMPILABLE([format(gnu_printf, ...) attribute], [#include ], [void *foo(const char *format, ...) __attribute__((format(gnu_printf, 1, 2)));], [je_cv_format_gnu_printf]) @@ -485,6 +557,7 @@ fi dnl Check for format(printf, ...) attribute support. SAVED_CFLAGS="${CFLAGS}" JE_CFLAGS_APPEND([-Werror]) +JE_CFLAGS_APPEND([-herror_on_warning]) JE_COMPILABLE([format(printf, ...) attribute], [#include ], [void *foo(const char *format, ...) 
__attribute__((format(printf, 1, 2)));], [je_cv_format_printf]) @@ -879,9 +952,9 @@ fi AC_MSG_CHECKING([configured backtracing method]) AC_MSG_RESULT([$backtrace_method]) if test "x$enable_prof" = "x1" ; then - if test "x$abi" != "xpecoff"; then - dnl Heap profiling uses the log(3) function. - LIBS="$LIBS -lm" + dnl Heap profiling uses the log(3) function. + if test "x$LM" != "x" ; then + LIBS="$LIBS $LM" fi AC_DEFINE([JEMALLOC_PROF], [ ]) @@ -1050,6 +1123,23 @@ if test "x$enable_cache_oblivious" = "x1" ; then fi AC_SUBST([enable_cache_oblivious]) + + +JE_COMPILABLE([a program using __builtin_unreachable], [ +void foo (void) { + __builtin_unreachable(); +} +], [ + { + foo(); + } +], [je_cv_gcc_builtin_unreachable]) +if test "x${je_cv_gcc_builtin_unreachable}" = "xyes" ; then + AC_DEFINE([JEMALLOC_INTERNAL_UNREACHABLE], [__builtin_unreachable]) +else + AC_DEFINE([JEMALLOC_INTERNAL_UNREACHABLE], [abort]) +fi + dnl ============================================================================ dnl Check for __builtin_ffsl(), then ffsl(3), and fail if neither are found. dnl One of those two functions should (theoretically) exist on all platforms @@ -1244,6 +1334,74 @@ CPPFLAGS="$CPPFLAGS -D_REENTRANT" dnl Check whether clock_gettime(2) is in libc or librt. AC_SEARCH_LIBS([clock_gettime], [rt]) +dnl Cray wrapper compiler often adds `-lrt` when using `-static`. Check with +dnl `-dynamic` as well in case a user tries to dynamically link in jemalloc +if test "x$je_cv_cray_prgenv_wrapper" = "xyes" ; then + if test "$ac_cv_search_clock_gettime" != "-lrt"; then + SAVED_CFLAGS="${CFLAGS}" + + unset ac_cv_search_clock_gettime + JE_CFLAGS_APPEND([-dynamic]) + AC_SEARCH_LIBS([clock_gettime], [rt]) + + CFLAGS="${SAVED_CFLAGS}" + fi +fi + +dnl check for CLOCK_MONOTONIC_COARSE (Linux-specific). +JE_COMPILABLE([clock_gettime(CLOCK_MONOTONIC_COARSE, ...)], [ +#include <time.h> +], [ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC_COARSE, &ts); +], [je_cv_clock_monotonic_coarse]) +if test "x${je_cv_clock_monotonic_coarse}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE]) +fi + +dnl check for CLOCK_MONOTONIC. +JE_COMPILABLE([clock_gettime(CLOCK_MONOTONIC, ...)], [ +#include <unistd.h> +#include <time.h> +], [ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); +#if !defined(_POSIX_MONOTONIC_CLOCK) || _POSIX_MONOTONIC_CLOCK < 0 +# error _POSIX_MONOTONIC_CLOCK missing/invalid +#endif +], [je_cv_clock_monotonic]) +if test "x${je_cv_clock_monotonic}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_CLOCK_MONOTONIC]) +fi + +dnl Check for mach_absolute_time(). +JE_COMPILABLE([mach_absolute_time()], [ +#include <mach/mach_time.h> +], [ + mach_absolute_time(); +], [je_cv_mach_absolute_time]) +if test "x${je_cv_mach_absolute_time}" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_MACH_ABSOLUTE_TIME]) +fi + +dnl Check if syscall(2) is usable. Treat warnings as errors, so that e.g. OS X +dnl 10.12's deprecation warning prevents use. +SAVED_CFLAGS="${CFLAGS}" +JE_CFLAGS_APPEND([-Werror]) +JE_COMPILABLE([syscall(2)], [ +#include <sys/syscall.h> +#include <unistd.h> +], [ + syscall(SYS_write, 2, "hello", 5); +], + [je_cv_syscall]) CFLAGS="${SAVED_CFLAGS}" if test "x$je_cv_syscall" = "xyes" ; then + AC_DEFINE([JEMALLOC_HAVE_SYSCALL], [ ]) +fi + dnl Check if the GNU-specific secure_getenv function exists.
AC_CHECK_FUNC([secure_getenv], [have_secure_getenv="1"], @@ -1298,9 +1456,17 @@ fi ], [enable_lazy_lock=""] ) -if test "x$enable_lazy_lock" = "x" -a "x${force_lazy_lock}" = "x1" ; then - AC_MSG_RESULT([Forcing lazy-lock to avoid allocator/threading bootstrap issues]) - enable_lazy_lock="1" +if test "x${enable_lazy_lock}" = "x" ; then + if test "x${force_lazy_lock}" = "x1" ; then + AC_MSG_RESULT([Forcing lazy-lock to avoid allocator/threading bootstrap issues]) + enable_lazy_lock="1" + else + enable_lazy_lock="0" + fi +fi +if test "x${enable_lazy_lock}" = "x1" -a "x${abi}" = "xpecoff" ; then + AC_MSG_RESULT([Forcing no lazy-lock because thread creation monitoring is unimplemented]) + enable_lazy_lock="0" fi if test "x$enable_lazy_lock" = "x1" ; then if test "x$abi" != "xpecoff" ; then @@ -1311,8 +1477,6 @@ if test "x$enable_lazy_lock" = "x1" ; then ]) fi AC_DEFINE([JEMALLOC_LAZY_LOCK], [ ]) -else - enable_lazy_lock="0" fi AC_SUBST([enable_lazy_lock]) @@ -1500,6 +1664,20 @@ if test "x${je_cv_builtin_clz}" = "xyes" ; then AC_DEFINE([JEMALLOC_HAVE_BUILTIN_CLZ], [ ]) fi +dnl ============================================================================ +dnl Check for os_unfair_lock operations as provided on Darwin. + +JE_COMPILABLE([Darwin os_unfair_lock_*()], [ +#include <os/lock.h> +], [ + os_unfair_lock lock = OS_UNFAIR_LOCK_INIT; + os_unfair_lock_lock(&lock); + os_unfair_lock_unlock(&lock); +], [je_cv_os_unfair_lock]) +if test "x${je_cv_os_unfair_lock}" = "xyes" ; then + AC_DEFINE([JEMALLOC_OS_UNFAIR_LOCK], [ ]) +fi + dnl ============================================================================ dnl Check for spinlock(3) operations as provided on Darwin. @@ -1744,6 +1922,7 @@ AC_MSG_RESULT([]) AC_MSG_RESULT([CONFIG : ${CONFIG}]) AC_MSG_RESULT([CC : ${CC}]) AC_MSG_RESULT([CFLAGS : ${CFLAGS}]) +AC_MSG_RESULT([EXTRA_CFLAGS : ${EXTRA_CFLAGS}]) AC_MSG_RESULT([CPPFLAGS : ${CPPFLAGS}]) AC_MSG_RESULT([LDFLAGS : ${LDFLAGS}]) AC_MSG_RESULT([EXTRA_LDFLAGS : ${EXTRA_LDFLAGS}]) diff --git a/doc/html.xsl.in b/doc/html.xsl.in index a91d9746..ec4fa655 100644 --- a/doc/html.xsl.in +++ b/doc/html.xsl.in @@ -1,4 +1,5 @@ + diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index c4a44e3c..3d2e721d 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -52,7 +52,7 @@ LIBRARY This manual describes jemalloc @jemalloc_version@. More information can be found at the jemalloc website. + url="http://jemalloc.net/">jemalloc website. SYNOPSIS @@ -180,20 +180,20 @@ Standard API - The malloc function allocates + The malloc() function allocates size bytes of uninitialized memory. The allocated space is suitably aligned (after possible pointer coercion) for storage of any type of object. - The calloc function allocates + The calloc() function allocates space for number objects, each size bytes in length. The result is identical to - calling malloc with an argument of + calling malloc() with an argument of number * size, with the exception that the allocated memory is explicitly initialized to zero bytes. - The posix_memalign function + The posix_memalign() function allocates size bytes of memory such that the allocation's base address is a multiple of alignment, and returns the allocation in the value @@ -201,7 +201,7 @@ alignment must be a power of 2 at least as large as sizeof(void *). - The aligned_alloc function + The aligned_alloc() function allocates size bytes of memory such that the allocation's base address is a multiple of alignment.
The requested @@ -209,7 +209,7 @@ undefined if size is not an integral multiple of alignment. - The realloc function changes the + The realloc() function changes the size of the previously allocated memory referenced by ptr to size bytes. The contents of the memory are unchanged up to the lesser of the new and old @@ -217,26 +217,26 @@ portion of the memory are undefined. Upon success, the memory referenced by ptr is freed and a pointer to the newly allocated memory is returned. Note that - realloc may move the memory allocation, + realloc() may move the memory allocation, resulting in a different return value than ptr. If ptr is NULL, the - realloc function behaves identically to - malloc for the specified size. + realloc() function behaves identically to + malloc() for the specified size. - The free function causes the + The free() function causes the allocated memory referenced by ptr to be made available for future allocations. If ptr is NULL, no action occurs. Non-standard API - The mallocx, - rallocx, - xallocx, - sallocx, - dallocx, - sdallocx, and - nallocx functions all have a + The mallocx(), + rallocx(), + xallocx(), + sallocx(), + dallocx(), + sdallocx(), and + nallocx() functions all have a flags argument that can be used to specify options. The functions only check the options that are contextually relevant. Use bitwise or (|) operations to @@ -307,19 +307,19 @@ - The mallocx function allocates at + The mallocx() function allocates at least size bytes of memory, and returns a pointer to the base address of the allocation. Behavior is undefined if size is 0. - The rallocx function resizes the + The rallocx() function resizes the allocation at ptr to be at least size bytes, and returns a pointer to the base address of the resulting allocation, which may or may not have moved from its original location. Behavior is undefined if size is 0. - The xallocx function resizes the + The xallocx() function resizes the allocation at ptr in place to be at least size bytes, and returns the real size of the allocation. If extra is non-zero, an attempt is @@ -332,32 +332,32 @@ language="C">(size + extra > SIZE_T_MAX). - The sallocx function returns the + The sallocx() function returns the real size of the allocation at ptr. - The dallocx function causes the + The dallocx() function causes the memory referenced by ptr to be made available for future allocations. - The sdallocx function is an - extension of dallocx with a + The sdallocx() function is an + extension of dallocx() with a size parameter to allow the caller to pass in the allocation size as an optimization. The minimum valid input size is the original requested size of the allocation, and the maximum valid input size is the corresponding value returned by - nallocx or - sallocx. + nallocx() or + sallocx(). - The nallocx function allocates no + The nallocx() function allocates no memory, but it performs the same size computation as the - mallocx function, and returns the real + mallocx() function, and returns the real size of the allocation that would result from the equivalent - mallocx function call, or + mallocx() function call, or 0 if the inputs exceed the maximum supported size class and/or alignment. Behavior is undefined if size is 0. - The mallctl function provides a + The mallctl() function provides a general interface for introspecting the memory allocator, as well as setting modifiable parameters and triggering actions. The period-separated name argument specifies a @@ -372,12 +372,12 @@ newlen; otherwise pass NULL and 0. 
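As an aside for readers of the mallctl() description above, the following is a minimal sketch of a read/write round trip (illustrative only: error handling is elided, and the "stats.allocated" read requires --enable-stats; both the "epoch" and "stats.allocated" names are documented in the MALLCTL NAMESPACE section below):

    #include <stdint.h>
    #include <stdio.h>
    #include <jemalloc/jemalloc.h>

    static void
    refresh_and_report(void)
    {
        uint64_t epoch = 1;
        size_t allocated, sz = sizeof(allocated);

        /* Write a new value to "epoch" to refresh cached statistics. */
        mallctl("epoch", NULL, NULL, &epoch, sizeof(epoch));

        /* Read the refreshed count of bytes allocated by the application. */
        if (mallctl("stats.allocated", &allocated, &sz, NULL, 0) == 0)
            fprintf(stderr, "allocated: %zu\n", allocated);
    }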
- The mallctlnametomib() function provides a way to avoid repeated name lookups for applications that repeatedly query the same portion of the namespace, by translating a name - to a “Management Information Base” (MIB) that can be passed - repeatedly to mallctlbymib. Upon - successful return from mallctlnametomib, + to a Management Information Base (MIB) that can be passed + repeatedly to mallctlbymib(). Upon + successful return from mallctlnametomib(), mibp contains an array of *miblenp integers, where *miblenp is the lesser of the number of components @@ -410,39 +410,40 @@ for (i = 0; i < nbins; i++) { /* Do something with bin_size... */ }]]> - The malloc_stats_print function - writes human-readable summary statistics via the - write_cb callback function pointer and - cbopaque data passed to - write_cb, or - malloc_message if - write_cb is NULL. This - function can be called repeatedly. General information that never - changes during execution can be omitted by specifying "g" as a character + The malloc_stats_print() function writes + summary statistics via the write_cb callback + function pointer and cbopaque data passed to + write_cb, or malloc_message() + if write_cb is NULL. The + statistics are presented in human-readable form unless J is + specified as a character within the opts string, in + which case the statistics are presented in JSON format. This function can be + called repeatedly. General information that never changes during + execution can be omitted by specifying g as a character within the opts string. Note that - malloc_message uses the - mallctl* functions internally, so - inconsistent statistics can be reported if multiple threads use these - functions simultaneously. If --enable-stats is - specified during configuration, “m” and “a” can - be specified to omit merged arena and per arena statistics, respectively; - “b”, “l”, and “h” can be specified to - omit per size class statistics for bins, large objects, and huge objects, - respectively. Unrecognized characters are silently ignored. Note that - thread caching may prevent some statistics from being completely up to - date, since extra locking would be required to merge counters that track - thread cache operations. - + malloc_message() uses the + mallctl*() functions internally, so inconsistent + statistics can be reported if multiple threads use these functions + simultaneously. If --enable-stats is specified during + configuration, m and a can be specified to + omit merged arena and per arena statistics, respectively; + b, l, and h can be specified + to omit per size class statistics for bins, large objects, and huge + objects, respectively. Unrecognized characters are silently ignored. + Note that thread caching may prevent some statistics from being completely + up to date, since extra locking would be required to merge counters that + track thread cache operations. - The malloc_usable_size function + The malloc_usable_size() function returns the usable size of the allocation pointed to by ptr. The return value may be larger than the size that was requested during allocation. The - malloc_usable_size function is not a - mechanism for in-place realloc; rather + malloc_usable_size() function is not a + mechanism for in-place realloc(); rather it is provided solely as a tool for introspection purposes.
Any discrepancy between the requested allocation size and the size reported - by malloc_usable_size should not be + by malloc_usable_size() should not be depended on, since such behavior is entirely implementation-dependent. @@ -455,12 +456,12 @@ for (i = 0; i < nbins; i++) { The string specified via --with-malloc-conf, the string pointed to by the global variable malloc_conf, the - “name” of the file referenced by the symbolic link named + name of the file referenced by the symbolic link named /etc/malloc.conf, and the value of the environment variable MALLOC_CONF, will be interpreted, in that order, from left to right as options. Note that malloc_conf may be read before - main is entered, so the declaration of + main() is entered, so the declaration of malloc_conf should specify an initializer that contains the final value to be read by jemalloc. --with-malloc-conf and malloc_conf are compile-time mechanisms, whereas @@ -549,14 +550,14 @@ for (i = 0; i < nbins; i++) { nearest multiple of the cacheline size, or specify cacheline alignment when allocating. - The realloc, - rallocx, and - xallocx functions may resize allocations + The realloc(), + rallocx(), and + xallocx() functions may resize allocations without moving them under limited circumstances. Unlike the - *allocx API, the standard API does not + *allocx() API, the standard API does not officially round up the usable size of an allocation to the nearest size class, so technically it is necessary to call - realloc to grow e.g. a 9-byte allocation to + realloc() to grow e.g. a 9-byte allocation to 16 bytes, or shrink a 16-byte allocation to 9 bytes. Growth and shrinkage trivially succeeds in place as long as the pre-size and post-size both round up to the same size class. No other API guarantees are made regarding @@ -702,7 +703,7 @@ for (i = 0; i < nbins; i++) { MALLCTL NAMESPACE The following names are defined in the namespace accessible via the - mallctl* functions. Value types are + mallctl*() functions. Value types are specified in parentheses, their readable/writable statuses are encoded as rw, r-, -w, or --, and required build configuration flags follow, if @@ -733,7 +734,7 @@ for (i = 0; i < nbins; i++) { rw If a value is passed in, refresh the data from which - the mallctl* functions report values, + the mallctl*() functions report values, and increment the epoch. Return the current epoch. This is useful for detecting whether another thread caused a refresh. @@ -917,12 +918,12 @@ for (i = 0; i < nbins; i++) { settings are supported if sbrk(2) is supported by the operating - system: “disabled”, “primary”, and - “secondary”; otherwise only “disabled” is - supported. The default is “secondary” if + system: disabled, primary, and + secondary; otherwise only disabled is + supported. The default is secondary if sbrk(2) is supported by the operating - system; “disabled” otherwise. + system; disabled otherwise. @@ -1013,19 +1014,19 @@ for (i = 0; i < nbins; i++) { r- Enable/disable statistics printing at exit. If - enabled, the malloc_stats_print + enabled, the malloc_stats_print() function is called at program exit via an atexit(3) function. If --enable-stats is specified during configuration, this has the potential to cause deadlock for a multi-threaded process that exits while one or more threads are executing in the memory allocation - functions.
Furthermore, atexit() may + allocate memory during application initialization and then deadlock internally when jemalloc in turn calls - atexit, so this option is not + atexit(), so this option is not universally usable (though the application can register its own - atexit function with equivalent + atexit() function with equivalent functionality). Therefore, this option should only be used with care; it is primarily intended as a performance tuning aid during application development. This option is disabled by default. @@ -1038,15 +1039,16 @@ for (i = 0; i < nbins; i++) { r- [] - Junk filling. If set to "alloc", each byte of - uninitialized allocated memory will be initialized to - 0xa5. If set to "free", all deallocated memory will - be initialized to 0x5a. If set to "true", both - allocated and deallocated memory will be initialized, and if set to - "false", junk filling be disabled entirely. This is intended for - debugging and will impact performance negatively. This option is - "false" by default unless --enable-debug is specified - during configuration, in which case it is "true" by default unless + Junk filling. If set to alloc, each byte + of uninitialized allocated memory will be initialized to + 0xa5. If set to free, all deallocated + memory will be initialized to 0x5a. If set to + true, both allocated and deallocated memory will be + initialized, and if set to false, junk filling will be + disabled entirely. This is intended for debugging and will impact + performance negatively. This option is false by default + unless --enable-debug is specified during + configuration, in which case it is true by default unless running inside Valgrind. @@ -1101,8 +1103,8 @@ for (i = 0; i < nbins; i++) { Zero filling enabled/disabled. If enabled, each byte of uninitialized allocated memory will be initialized to 0. Note that this initialization only happens once for each byte, so - realloc and - rallocx calls do not zero memory that + realloc() and + rallocx() calls do not zero memory that was previously allocated. This is intended for debugging and will impact performance negatively. This option is disabled by default. @@ -1325,11 +1327,11 @@ malloc_conf = "xmalloc:true";]]> <prefix>.<pid>.<seq>.f.heap, where <prefix> is controlled by the opt.prof_prefix - option. Note that atexit may allocate + option. Note that atexit() may allocate memory during application initialization and then deadlock internally - when jemalloc in turn calls atexit, so + when jemalloc in turn calls atexit(), so this option is not universally usable (though the application can - register its own atexit function with + register its own atexit() function with equivalent functionality). This option is disabled by default. @@ -1388,7 +1390,7 @@ malloc_conf = "xmalloc:true";]]> thread.allocated mallctl. This is useful for avoiding the overhead of repeated - mallctl* calls. + mallctl*() calls. @@ -1415,7 +1417,7 @@ malloc_conf = "xmalloc:true";]]> thread.deallocated mallctl. This is useful for avoiding the overhead of repeated - mallctl* calls. + mallctl*() calls. @@ -2734,7 +2736,7 @@ MAPPED_LIBRARIES: of run-time assertions that catch application errors such as double-free, write-after-free, etc. - Programs often accidentally depend on “uninitialized” + Programs often accidentally depend on uninitialized memory actually being filled with zero bytes.
Junk filling (see the opt.junk option) tends to expose such bugs in the form of obviously incorrect @@ -2763,29 +2765,29 @@ MAPPED_LIBRARIES: to override the function which emits the text strings forming the errors and warnings if for some reason the STDERR_FILENO file descriptor is not suitable for this. - malloc_message takes the + malloc_message() takes the cbopaque pointer argument that is NULL unless overridden by the arguments in a call to - malloc_stats_print, followed by a string + malloc_stats_print(), followed by a string pointer. Please note that doing anything which tries to allocate memory in this function is likely to result in a crash or deadlock. All messages are prefixed by - “<jemalloc>: ”. + <jemalloc>: . RETURN VALUES Standard API - The malloc and - calloc functions return a pointer to the + The malloc() and + calloc() functions return a pointer to the allocated memory if successful; otherwise a NULL pointer is returned and errno is set to ENOMEM. - The posix_memalign function + The posix_memalign() function returns the value 0 if successful; otherwise it returns an error value. - The posix_memalign function will fail + The posix_memalign() function will fail if: @@ -2804,11 +2806,11 @@ MAPPED_LIBRARIES: - The aligned_alloc function returns + The aligned_alloc() function returns a pointer to the allocated memory if successful; otherwise a NULL pointer is returned and errno is set. The - aligned_alloc function will fail if: + aligned_alloc() function will fail if: EINVAL @@ -2825,44 +2827,44 @@ MAPPED_LIBRARIES: - The realloc function returns a + The realloc() function returns a pointer, possibly identical to ptr, to the allocated memory if successful; otherwise a NULL pointer is returned, and errno is set to ENOMEM if the error was the result of an - allocation failure. The realloc + allocation failure. The realloc() function always leaves the original buffer intact when an error occurs. - The free function returns no + The free() function returns no value. Non-standard API - The mallocx and - rallocx functions return a pointer to + The mallocx() and + rallocx() functions return a pointer to the allocated memory if successful; otherwise a NULL pointer is returned to indicate insufficient contiguous memory was available to service the allocation request. - The xallocx function returns the + The xallocx() function returns the real size of the resulting resized allocation pointed to by ptr, which is a value less than size if the allocation could not be adequately grown in place. - The sallocx function returns the + The sallocx() function returns the real size of the allocation pointed to by ptr. - The nallocx returns the real size + The nallocx() returns the real size that would result from a successful equivalent - mallocx function call, or zero if + mallocx() function call, or zero if insufficient memory is available to perform the size computation. - The mallctl, - mallctlnametomib, and - mallctlbymib functions return 0 on + The mallctl(), + mallctlnametomib(), and + mallctlbymib() functions return 0 on success; otherwise they return an error value. The functions will fail if: @@ -2898,13 +2900,13 @@ MAPPED_LIBRARIES: EFAULT An interface with side effects failed in some way - not directly related to mallctl* + not directly related to mallctl*() read/write processing. - The malloc_usable_size function + The malloc_usable_size() function returns the usable size of the allocation pointed to by ptr. 
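As a usage note for the JSON output mode introduced in this release (described under malloc_stats_print() above), a minimal sketch:

    #include <jemalloc/jemalloc.h>

    static void
    dump_stats_json(void)
    {
        /*
         * "J" selects JSON format; appending "g" would omit general
         * information that never changes during execution.  Passing
         * NULL for write_cb routes output through malloc_message(),
         * which writes to STDERR_FILENO by default.
         */
        malloc_stats_print(NULL, NULL, "J");
    }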
@@ -2952,13 +2954,13 @@ malloc_conf = "lg_chunk:24";]]> STANDARDS - The malloc, - calloc, - realloc, and - free functions conform to ISO/IEC - 9899:1990 (“ISO C90”). + The malloc(), + calloc(), + realloc(), and + free() functions conform to ISO/IEC + 9899:1990 (ISO C90). - The posix_memalign function conforms - to IEEE Std 1003.1-2001 (“POSIX.1”). + The posix_memalign() function conforms + to IEEE Std 1003.1-2001 (POSIX.1). diff --git a/doc/stylesheet.xsl b/doc/stylesheet.xsl index 4e334a86..619365d8 100644 --- a/doc/stylesheet.xsl +++ b/doc/stylesheet.xsl @@ -1,7 +1,10 @@ ansi - + + + + - "" + diff --git a/include/jemalloc/internal/arena.h b/include/jemalloc/internal/arena.h index b1de2b61..1277d080 100644 --- a/include/jemalloc/internal/arena.h +++ b/include/jemalloc/internal/arena.h @@ -42,6 +42,7 @@ typedef struct arena_chunk_map_bits_s arena_chunk_map_bits_t; typedef struct arena_chunk_map_misc_s arena_chunk_map_misc_t; typedef struct arena_chunk_s arena_chunk_t; typedef struct arena_bin_info_s arena_bin_info_t; +typedef struct arena_decay_s arena_decay_t; typedef struct arena_bin_s arena_bin_t; typedef struct arena_s arena_t; typedef struct arena_tdata_s arena_tdata_t; @@ -257,6 +258,49 @@ struct arena_bin_info_s { uint32_t reg0_offset; }; +struct arena_decay_s { + /* + * Approximate time in seconds from the creation of a set of unused + * dirty pages until an equivalent set of unused dirty pages is purged + * and/or reused. + */ + ssize_t time; + /* time / SMOOTHSTEP_NSTEPS. */ + nstime_t interval; + /* + * Time at which the current decay interval logically started. We do + * not actually advance to a new epoch until sometime after it starts + * because of scheduling and computation delays, and it is even possible + * to completely skip epochs. In all cases, during epoch advancement we + * merge all relevant activity into the most recently recorded epoch. + */ + nstime_t epoch; + /* Deadline randomness generator. */ + uint64_t jitter_state; + /* + * Deadline for current epoch. This is the sum of interval and per + * epoch jitter which is a uniform random variable in [0..interval). + * Epochs always advance by precise multiples of interval, but we + * randomize the deadline to reduce the likelihood of arenas purging in + * lockstep. + */ + nstime_t deadline; + /* + * Number of dirty pages at beginning of current epoch. During epoch + * advancement we use the delta between arena->decay.ndirty and + * arena->ndirty to determine how many dirty pages, if any, were + * generated. + */ + size_t ndirty; + /* + * Trailing log of how many unused dirty pages were generated during + * each of the past SMOOTHSTEP_NSTEPS decay epochs, where the last + * element is the most recent epoch. Corresponding epoch times are + * relative to epoch. + */ + size_t backlog[SMOOTHSTEP_NSTEPS]; +}; + struct arena_bin_s { /* * All operations on runcur, runs, and stats require that lock be @@ -394,52 +438,8 @@ struct arena_s { arena_runs_dirty_link_t runs_dirty; extent_node_t chunks_cache; - /* - * Approximate time in seconds from the creation of a set of unused - * dirty pages until an equivalent set of unused dirty pages is purged - * and/or reused. - */ - ssize_t decay_time; - /* decay_time / SMOOTHSTEP_NSTEPS. */ - nstime_t decay_interval; - /* - * Time at which the current decay interval logically started. We do - * not actually advance to a new epoch until sometime after it starts - * because of scheduling and computation delays, and it is even possible - * to completely skip epochs. 
In all cases, during epoch advancement we - * merge all relevant activity into the most recently recorded epoch. - */ - nstime_t decay_epoch; - /* decay_deadline randomness generator. */ - uint64_t decay_jitter_state; - /* - * Deadline for current epoch. This is the sum of decay_interval and - * per epoch jitter which is a uniform random variable in - * [0..decay_interval). Epochs always advance by precise multiples of - * decay_interval, but we randomize the deadline to reduce the - * likelihood of arenas purging in lockstep. - */ - nstime_t decay_deadline; - /* - * Number of dirty pages at beginning of current epoch. During epoch - * advancement we use the delta between decay_ndirty and ndirty to - * determine how many dirty pages, if any, were generated, and record - * the result in decay_backlog. - */ - size_t decay_ndirty; - /* - * Memoized result of arena_decay_backlog_npages_limit() corresponding - * to the current contents of decay_backlog, i.e. the limit on how many - * pages are allowed to exist for the decay epochs. - */ - size_t decay_backlog_npages_limit; - /* - * Trailing log of how many unused dirty pages were generated during - * each of the past SMOOTHSTEP_NSTEPS decay epochs, where the last - * element is the most recent epoch. Corresponding epoch times are - * relative to decay_epoch. - */ - size_t decay_backlog[SMOOTHSTEP_NSTEPS]; + /* Decay-based purging state. */ + arena_decay_t decay; /* Extant huge allocations. */ ql_head(extent_node_t) huge; @@ -470,10 +470,12 @@ struct arena_s { arena_bin_t bins[NBINS]; /* - * Quantized address-ordered heaps of this arena's available runs. The - * heaps are used for first-best-fit run allocation. + * Size-segregated address-ordered heaps of this arena's available runs, + * used for first-best-fit run allocation. Runs are quantized, i.e. + * they reside in the last heap which corresponds to a size class less + * than or equal to the run size. */ - arena_run_heap_t runs_avail[1]; /* Dynamically sized. */ + arena_run_heap_t runs_avail[NPSIZES]; }; /* Used in conjunction with tsd for fast arena-related context lookup. */ @@ -505,7 +507,6 @@ extern size_t map_bias; /* Number of arena chunk header pages. */ extern size_t map_misc_offset; extern size_t arena_maxrun; /* Max run size for arenas. */ extern size_t large_maxclass; /* Max large size class. */ -extern size_t run_quantize_max; /* Max run_quantize_*() input. */ extern unsigned nlclasses; /* Number of large size classes. */ extern unsigned nhclasses; /* Number of huge size classes. */ @@ -601,7 +602,7 @@ unsigned arena_nthreads_get(arena_t *arena, bool internal); void arena_nthreads_inc(arena_t *arena, bool internal); void arena_nthreads_dec(arena_t *arena, bool internal); arena_t *arena_new(tsdn_t *tsdn, unsigned ind); -bool arena_boot(void); +void arena_boot(void); void arena_prefork0(tsdn_t *tsdn, arena_t *arena); void arena_prefork1(tsdn_t *tsdn, arena_t *arena); void arena_prefork2(tsdn_t *tsdn, arena_t *arena); diff --git a/include/jemalloc/internal/atomic.h b/include/jemalloc/internal/atomic.h index 3f15ea14..3936f68b 100644 --- a/include/jemalloc/internal/atomic.h +++ b/include/jemalloc/internal/atomic.h @@ -66,8 +66,7 @@ void atomic_write_u(unsigned *p, unsigned x); #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ATOMIC_C_)) /******************************************************************************/ /* 64-bit operations. 
*/ -#if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3) -# if (defined(__amd64__) || defined(__x86_64__)) +#if (defined(__amd64__) || defined(__x86_64__)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { @@ -125,7 +124,7 @@ atomic_write_uint64(uint64_t *p, uint64_t x) : "memory" /* Clobbers. */ ); } -# elif (defined(JEMALLOC_C11ATOMICS)) +#elif (defined(JEMALLOC_C11ATOMICS)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { @@ -153,7 +152,7 @@ atomic_write_uint64(uint64_t *p, uint64_t x) volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p; atomic_store(a, x); } -# elif (defined(JEMALLOC_ATOMIC9)) +#elif (defined(JEMALLOC_ATOMIC9)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { @@ -193,7 +192,7 @@ atomic_write_uint64(uint64_t *p, uint64_t x) atomic_store_rel_long(p, x); } -# elif (defined(JEMALLOC_OSATOMIC)) +#elif (defined(JEMALLOC_OSATOMIC)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { @@ -225,7 +224,7 @@ atomic_write_uint64(uint64_t *p, uint64_t x) o = atomic_read_uint64(p); } while (atomic_cas_uint64(p, o, x)); } -# elif (defined(_MSC_VER)) +#elif (defined(_MSC_VER)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) { @@ -255,7 +254,7 @@ atomic_write_uint64(uint64_t *p, uint64_t x) InterlockedExchange64(p, x); } -# elif (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8) || \ +#elif (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8) || \ defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_8)) JEMALLOC_INLINE uint64_t atomic_add_uint64(uint64_t *p, uint64_t x) @@ -284,9 +283,8 @@ atomic_write_uint64(uint64_t *p, uint64_t x) __sync_lock_test_and_set(p, x); } -# else -# error "Missing implementation for 64-bit atomic operations" -# endif +#else +# error "Missing implementation for 64-bit atomic operations" #endif /******************************************************************************/ diff --git a/include/jemalloc/internal/chunk.h b/include/jemalloc/internal/chunk.h index c9fd4ecb..38c9a012 100644 --- a/include/jemalloc/internal/chunk.h +++ b/include/jemalloc/internal/chunk.h @@ -58,7 +58,7 @@ void chunk_deregister(const void *chunk, const extent_node_t *node); void *chunk_alloc_base(size_t size); void *chunk_alloc_cache(tsdn_t *tsdn, arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr, size_t size, size_t alignment, - bool *zero, bool dalloc_node); + bool *zero, bool *commit, bool dalloc_node); void *chunk_alloc_wrapper(tsdn_t *tsdn, arena_t *arena, chunk_hooks_t *chunk_hooks, void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit); @@ -71,9 +71,6 @@ bool chunk_purge_wrapper(tsdn_t *tsdn, arena_t *arena, chunk_hooks_t *chunk_hooks, void *chunk, size_t size, size_t offset, size_t length); bool chunk_boot(void); -void chunk_prefork(tsdn_t *tsdn); -void chunk_postfork_parent(tsdn_t *tsdn); -void chunk_postfork_child(tsdn_t *tsdn); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ diff --git a/include/jemalloc/internal/chunk_dss.h b/include/jemalloc/internal/chunk_dss.h index 724fa579..da8511ba 100644 --- a/include/jemalloc/internal/chunk_dss.h +++ b/include/jemalloc/internal/chunk_dss.h @@ -21,15 +21,13 @@ extern const char *dss_prec_names[]; /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -dss_prec_t chunk_dss_prec_get(tsdn_t *tsdn); -bool chunk_dss_prec_set(tsdn_t *tsdn, dss_prec_t dss_prec); +dss_prec_t chunk_dss_prec_get(void); +bool 
chunk_dss_prec_set(dss_prec_t dss_prec); void *chunk_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit); -bool chunk_in_dss(tsdn_t *tsdn, void *chunk); -bool chunk_dss_boot(void); -void chunk_dss_prefork(tsdn_t *tsdn); -void chunk_dss_postfork_parent(tsdn_t *tsdn); -void chunk_dss_postfork_child(tsdn_t *tsdn); +bool chunk_in_dss(void *chunk); +bool chunk_dss_mergeable(void *chunk_a, void *chunk_b); +void chunk_dss_boot(void); #endif /* JEMALLOC_H_EXTERNS */ /******************************************************************************/ diff --git a/include/jemalloc/internal/ckh.h b/include/jemalloc/internal/ckh.h index 46e151cd..f75ad90b 100644 --- a/include/jemalloc/internal/ckh.h +++ b/include/jemalloc/internal/ckh.h @@ -64,13 +64,13 @@ struct ckh_s { /******************************************************************************/ #ifdef JEMALLOC_H_EXTERNS -bool ckh_new(tsdn_t *tsdn, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, +bool ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp); -void ckh_delete(tsdn_t *tsdn, ckh_t *ckh); +void ckh_delete(tsd_t *tsd, ckh_t *ckh); size_t ckh_count(ckh_t *ckh); bool ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data); -bool ckh_insert(tsdn_t *tsdn, ckh_t *ckh, const void *key, const void *data); -bool ckh_remove(tsdn_t *tsdn, ckh_t *ckh, const void *searchkey, void **key, +bool ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data); +bool ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key, void **data); bool ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data); void ckh_string_hash(const void *key, size_t r_hash[2]); diff --git a/include/jemalloc/internal/huge.h b/include/jemalloc/internal/huge.h index b5fa9e63..22184d9b 100644 --- a/include/jemalloc/internal/huge.h +++ b/include/jemalloc/internal/huge.h @@ -17,7 +17,7 @@ bool huge_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, void *huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t usize, size_t alignment, bool zero, tcache_t *tcache); #ifdef JEMALLOC_JET -typedef void (huge_dalloc_junk_t)(tsdn_t *, void *, size_t); +typedef void (huge_dalloc_junk_t)(void *, size_t); extern huge_dalloc_junk_t *huge_dalloc_junk; #endif void huge_dalloc(tsdn_t *tsdn, void *ptr); diff --git a/include/jemalloc/internal/jemalloc_internal.h.in b/include/jemalloc/internal/jemalloc_internal.h.in index 8f82edd4..fdc8fef9 100644 --- a/include/jemalloc/internal/jemalloc_internal.h.in +++ b/include/jemalloc/internal/jemalloc_internal.h.in @@ -162,7 +162,9 @@ static const bool config_cache_oblivious = #endif #include "jemalloc/internal/ph.h" +#ifndef __PGI #define RB_COMPACT +#endif #include "jemalloc/internal/rb.h" #include "jemalloc/internal/qr.h" #include "jemalloc/internal/ql.h" @@ -185,6 +187,9 @@ static const bool config_cache_oblivious = #include "jemalloc/internal/jemalloc_internal_macros.h" +/* Page size index type. */ +typedef unsigned pszind_t; + /* Size class index type. 
*/ typedef unsigned szind_t; @@ -234,7 +239,7 @@ typedef unsigned szind_t; # ifdef __alpha__ # define LG_QUANTUM 4 # endif -# if (defined(__sparc64__) || defined(__sparcv9)) +# if (defined(__sparc64__) || defined(__sparcv9) || defined(__sparc_v9__)) # define LG_QUANTUM 4 # endif # if (defined(__amd64__) || defined(__x86_64__) || defined(_M_X64)) @@ -364,6 +369,7 @@ typedef unsigned szind_t; #include "jemalloc/internal/valgrind.h" #include "jemalloc/internal/util.h" #include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/spin.h" #include "jemalloc/internal/prng.h" #include "jemalloc/internal/ticker.h" #include "jemalloc/internal/ckh.h" @@ -396,6 +402,7 @@ typedef unsigned szind_t; #include "jemalloc/internal/valgrind.h" #include "jemalloc/internal/util.h" #include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/spin.h" #include "jemalloc/internal/prng.h" #include "jemalloc/internal/ticker.h" #include "jemalloc/internal/ckh.h" @@ -455,11 +462,16 @@ extern unsigned narenas_auto; */ extern arena_t **arenas; +/* + * pind2sz_tab encodes the same information as could be computed by + * pind2sz_compute(). + */ +extern size_t const pind2sz_tab[NPSIZES]; /* * index2size_tab encodes the same information as could be computed (at * unacceptable cost in some code paths) by index2size_compute(). */ -extern size_t const index2size_tab[NSIZES+1]; +extern size_t const index2size_tab[NSIZES]; /* * size2index_tab is a compact lookup table that rounds request sizes up to * size classes. In order to reduce cache footprint, the table is compressed, @@ -467,6 +479,7 @@ extern size_t const index2size_tab[NSIZES+1]; */ extern uint8_t const size2index_tab[]; +arena_t *a0get(void); void *a0malloc(size_t size); void a0dalloc(void *ptr); void *bootstrap_malloc(size_t size); @@ -492,6 +505,7 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/valgrind.h" #include "jemalloc/internal/util.h" #include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/spin.h" #include "jemalloc/internal/prng.h" #include "jemalloc/internal/ticker.h" #include "jemalloc/internal/ckh.h" @@ -524,6 +538,7 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/valgrind.h" #include "jemalloc/internal/util.h" #include "jemalloc/internal/atomic.h" +#include "jemalloc/internal/spin.h" #include "jemalloc/internal/prng.h" #include "jemalloc/internal/ticker.h" #include "jemalloc/internal/ckh.h" @@ -543,6 +558,11 @@ void jemalloc_postfork_child(void); #include "jemalloc/internal/huge.h" #ifndef JEMALLOC_ENABLE_INLINE +pszind_t psz2ind(size_t psz); +size_t pind2sz_compute(pszind_t pind); +size_t pind2sz_lookup(pszind_t pind); +size_t pind2sz(pszind_t pind); +size_t psz2u(size_t psz); szind_t size2index_compute(size_t size); szind_t size2index_lookup(size_t size); szind_t size2index(size_t size); @@ -555,7 +575,7 @@ size_t s2u(size_t size); size_t sa2u(size_t size, size_t alignment); arena_t *arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal); arena_t *arena_choose(tsd_t *tsd, arena_t *arena); -arena_t *arena_ichoose(tsdn_t *tsdn, arena_t *arena); +arena_t *arena_ichoose(tsd_t *tsd, arena_t *arena); arena_tdata_t *arena_tdata_get(tsd_t *tsd, unsigned ind, bool refresh_if_missing); arena_t *arena_get(tsdn_t *tsdn, unsigned ind, bool init_if_missing); @@ -563,10 +583,90 @@ ticker_t *decay_ticker_get(tsd_t *tsd, unsigned ind); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_C_)) +JEMALLOC_INLINE pszind_t +psz2ind(size_t psz) +{ + + if (unlikely(psz > HUGE_MAXCLASS)) + return 
(NPSIZES); + { + pszind_t x = lg_floor((psz<<1)-1); + pszind_t shift = (x < LG_SIZE_CLASS_GROUP + LG_PAGE) ? 0 : x - + (LG_SIZE_CLASS_GROUP + LG_PAGE); + pszind_t grp = shift << LG_SIZE_CLASS_GROUP; + + pszind_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_PAGE + 1) ? + LG_PAGE : x - LG_SIZE_CLASS_GROUP - 1; + + size_t delta_inverse_mask = ZI(-1) << lg_delta; + pszind_t mod = ((((psz-1) & delta_inverse_mask) >> lg_delta)) & + ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1); + + pszind_t ind = grp + mod; + return (ind); + } +} + +JEMALLOC_INLINE size_t +pind2sz_compute(pszind_t pind) +{ + + { + size_t grp = pind >> LG_SIZE_CLASS_GROUP; + size_t mod = pind & ((ZU(1) << LG_SIZE_CLASS_GROUP) - 1); + + size_t grp_size_mask = ~((!!grp)-1); + size_t grp_size = ((ZU(1) << (LG_PAGE + + (LG_SIZE_CLASS_GROUP-1))) << grp) & grp_size_mask; + + size_t shift = (grp == 0) ? 1 : grp; + size_t lg_delta = shift + (LG_PAGE-1); + size_t mod_size = (mod+1) << lg_delta; + + size_t sz = grp_size + mod_size; + return (sz); + } +} + +JEMALLOC_INLINE size_t +pind2sz_lookup(pszind_t pind) +{ + size_t ret = (size_t)pind2sz_tab[pind]; + assert(ret == pind2sz_compute(pind)); + return (ret); +} + +JEMALLOC_INLINE size_t +pind2sz(pszind_t pind) +{ + + assert(pind < NPSIZES); + return (pind2sz_lookup(pind)); +} + +JEMALLOC_INLINE size_t +psz2u(size_t psz) +{ + + if (unlikely(psz > HUGE_MAXCLASS)) + return (0); + { + size_t x = lg_floor((psz<<1)-1); + size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_PAGE + 1) ? + LG_PAGE : x - LG_SIZE_CLASS_GROUP - 1; + size_t delta = ZU(1) << lg_delta; + size_t delta_mask = delta - 1; + size_t usize = (psz + delta_mask) & ~delta_mask; + return (usize); + } +} + JEMALLOC_INLINE szind_t size2index_compute(size_t size) { + if (unlikely(size > HUGE_MAXCLASS)) + return (NSIZES); #if (NTBINS != 0) if (size <= (ZU(1) << LG_TINY_MAXCLASS)) { szind_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1; @@ -575,9 +675,7 @@ size2index_compute(size_t size) } #endif { - szind_t x = unlikely(ZI(size) < 0) ? ((size<<1) ? - (ZU(1)<<(LG_SIZEOF_PTR+3)) : ((ZU(1)<<(LG_SIZEOF_PTR+3))-1)) - : lg_floor((size<<1)-1); + szind_t x = lg_floor((size<<1)-1); szind_t shift = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM) ? 0 : x - (LG_SIZE_CLASS_GROUP + LG_QUANTUM); szind_t grp = shift << LG_SIZE_CLASS_GROUP; @@ -663,6 +761,8 @@ JEMALLOC_ALWAYS_INLINE size_t s2u_compute(size_t size) { + if (unlikely(size > HUGE_MAXCLASS)) + return (0); #if (NTBINS > 0) if (size <= (ZU(1) << LG_TINY_MAXCLASS)) { size_t lg_tmin = LG_TINY_MAXCLASS - NTBINS + 1; @@ -672,9 +772,7 @@ s2u_compute(size_t size) } #endif { - size_t x = unlikely(ZI(size) < 0) ? ((size<<1) ? - (ZU(1)<<(LG_SIZEOF_PTR+3)) : ((ZU(1)<<(LG_SIZEOF_PTR+3))-1)) - : lg_floor((size<<1)-1); + size_t x = lg_floor((size<<1)-1); size_t lg_delta = (x < LG_SIZE_CLASS_GROUP + LG_QUANTUM + 1) ? 
LG_QUANTUM : x - LG_SIZE_CLASS_GROUP - 1; size_t delta = ZU(1) << lg_delta; @@ -815,14 +913,10 @@ arena_choose(tsd_t *tsd, arena_t *arena) } JEMALLOC_INLINE arena_t * -arena_ichoose(tsdn_t *tsdn, arena_t *arena) +arena_ichoose(tsd_t *tsd, arena_t *arena) { - assert(!tsdn_null(tsdn) || arena != NULL); - - if (!tsdn_null(tsdn)) - return (arena_choose_impl(tsdn_tsd(tsdn), NULL, true)); - return (arena); + return (arena_choose_impl(tsd, arena, true)); } JEMALLOC_INLINE arena_tdata_t * diff --git a/include/jemalloc/internal/jemalloc_internal_decls.h b/include/jemalloc/internal/jemalloc_internal_decls.h index 2b8ca5d0..c907d910 100644 --- a/include/jemalloc/internal/jemalloc_internal_decls.h +++ b/include/jemalloc/internal/jemalloc_internal_decls.h @@ -17,8 +17,18 @@ # include # endif # include +# ifdef JEMALLOC_OS_UNFAIR_LOCK +# include +# endif +# ifdef JEMALLOC_GLIBC_MALLOC_HOOK +# include +# endif # include # include +# include +# ifdef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME +# include +# endif #endif #include diff --git a/include/jemalloc/internal/jemalloc_internal_defs.h.in b/include/jemalloc/internal/jemalloc_internal_defs.h.in index 7de0cf7c..9b3dca50 100644 --- a/include/jemalloc/internal/jemalloc_internal_defs.h.in +++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in @@ -60,12 +60,20 @@ */ #undef JEMALLOC_HAVE_MADVISE +/* + * Defined if os_unfair_lock_*() functions are available, as provided by Darwin. + */ +#undef JEMALLOC_OS_UNFAIR_LOCK + /* * Defined if OSSpin*() functions are available, as provided by Darwin, and * documented in the spinlock(3) manual page. */ #undef JEMALLOC_OSSPIN +/* Defined if syscall(2) is available. */ +#undef JEMALLOC_HAVE_SYSCALL + /* * Defined if secure_getenv(3) is available. */ @@ -76,6 +84,21 @@ */ #undef JEMALLOC_HAVE_ISSETUGID +/* + * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available. + */ +#undef JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE + +/* + * Defined if clock_gettime(CLOCK_MONOTONIC, ...) is available. + */ +#undef JEMALLOC_HAVE_CLOCK_MONOTONIC + +/* + * Defined if mach_absolute_time() is available. + */ +#undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME + /* * Defined if _malloc_thread_cleanup() exists. At least in the case of * FreeBSD, pthread_key_create() allocates, which if used during malloc @@ -188,6 +211,12 @@ /* TLS is used to map arenas and magazine caches to threads. */ #undef JEMALLOC_TLS +/* + * Used to mark unreachable code to quiet "end of non-void" compiler warnings. + * Don't use this directly; instead use unreachable() from util.h + */ +#undef JEMALLOC_INTERNAL_UNREACHABLE + /* * ffs*() functions to use for bitmapping. Don't use these directly; instead, * use ffs_*() from util.h. 
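The JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE, JEMALLOC_HAVE_CLOCK_MONOTONIC, and JEMALLOC_HAVE_MACH_ABSOLUTE_TIME defines above let configure record which monotonic time source is available; the nstime_monotonic()/nstime_update() interfaces added later in this diff consume that choice. A minimal sketch of such compile-time selection, assuming only the JEMALLOC_HAVE_* macros; example_clock_ns() and its fallback behavior are hypothetical, not jemalloc's actual nstime.c:

#include <stdint.h>
#include <time.h>
#ifdef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME
#include <mach/mach_time.h>
#endif

static uint64_t
example_clock_ns(void)
{
#if defined(JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE)
	/* Preferred on Linux: monotonic and cheap to read. */
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC_COARSE, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec);
#elif defined(JEMALLOC_HAVE_CLOCK_MONOTONIC)
	/* Portable POSIX monotonic clock. */
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec);
#elif defined(JEMALLOC_HAVE_MACH_ABSOLUTE_TIME)
	/* Darwin: convert Mach ticks to nanoseconds via the timebase. */
	mach_timebase_info_data_t tb;

	mach_timebase_info(&tb);
	return (mach_absolute_time() * tb.numer / tb.denom);
#else
	/* No monotonic source detected; nstime_monotonic() would report false. */
	return (0);
#endif
}

Preferring the coarse clock first matches the ordering implied by the defines: decay timing needs monotonicity far more than it needs nanosecond resolution.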
diff --git a/include/jemalloc/internal/mb.h b/include/jemalloc/internal/mb.h index 437c86f7..5384728f 100644 --- a/include/jemalloc/internal/mb.h +++ b/include/jemalloc/internal/mb.h @@ -105,8 +105,8 @@ mb_write(void) malloc_mutex_t mtx; malloc_mutex_init(&mtx, "mb", WITNESS_RANK_OMIT); - malloc_mutex_lock(NULL, &mtx); - malloc_mutex_unlock(NULL, &mtx); + malloc_mutex_lock(TSDN_NULL, &mtx); + malloc_mutex_unlock(TSDN_NULL, &mtx); } #endif #endif diff --git a/include/jemalloc/internal/mutex.h b/include/jemalloc/internal/mutex.h index 52217991..b442d2d4 100644 --- a/include/jemalloc/internal/mutex.h +++ b/include/jemalloc/internal/mutex.h @@ -5,6 +5,9 @@ typedef struct malloc_mutex_s malloc_mutex_t; #ifdef _WIN32 # define MALLOC_MUTEX_INITIALIZER +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) +# define MALLOC_MUTEX_INITIALIZER \ + {OS_UNFAIR_LOCK_INIT, WITNESS_INITIALIZER(WITNESS_RANK_OMIT)} #elif (defined(JEMALLOC_OSSPIN)) # define MALLOC_MUTEX_INITIALIZER {0, WITNESS_INITIALIZER(WITNESS_RANK_OMIT)} #elif (defined(JEMALLOC_MUTEX_INIT_CB)) @@ -35,6 +38,8 @@ struct malloc_mutex_s { # else CRITICAL_SECTION lock; # endif +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) + os_unfair_lock lock; #elif (defined(JEMALLOC_OSSPIN)) OSSpinLock lock; #elif (defined(JEMALLOC_MUTEX_INIT_CB)) @@ -88,6 +93,8 @@ malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) # else EnterCriticalSection(&mutex->lock); # endif +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) + os_unfair_lock_lock(&mutex->lock); #elif (defined(JEMALLOC_OSSPIN)) OSSpinLockLock(&mutex->lock); #else @@ -109,6 +116,8 @@ malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex) # else LeaveCriticalSection(&mutex->lock); # endif +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) + os_unfair_lock_unlock(&mutex->lock); #elif (defined(JEMALLOC_OSSPIN)) OSSpinLockUnlock(&mutex->lock); #else diff --git a/include/jemalloc/internal/nstime.h b/include/jemalloc/internal/nstime.h index dc293b73..93b27dc8 100644 --- a/include/jemalloc/internal/nstime.h +++ b/include/jemalloc/internal/nstime.h @@ -1,9 +1,6 @@ /******************************************************************************/ #ifdef JEMALLOC_H_TYPES -#define JEMALLOC_CLOCK_GETTIME defined(_POSIX_MONOTONIC_CLOCK) \ - && _POSIX_MONOTONIC_CLOCK >= 0 - typedef struct nstime_s nstime_t; /* Maximum supported number of seconds (~584 years). 
*/ @@ -34,9 +31,12 @@ void nstime_imultiply(nstime_t *time, uint64_t multiplier); void nstime_idivide(nstime_t *time, uint64_t divisor); uint64_t nstime_divide(const nstime_t *time, const nstime_t *divisor); #ifdef JEMALLOC_JET +typedef bool (nstime_monotonic_t)(void); +extern nstime_monotonic_t *nstime_monotonic; typedef bool (nstime_update_t)(nstime_t *); extern nstime_update_t *nstime_update; #else +bool nstime_monotonic(void); bool nstime_update(nstime_t *time); #endif diff --git a/include/jemalloc/internal/private_symbols.txt b/include/jemalloc/internal/private_symbols.txt index f2b6a55d..8972b37b 100644 --- a/include/jemalloc/internal/private_symbols.txt +++ b/include/jemalloc/internal/private_symbols.txt @@ -1,4 +1,5 @@ a0dalloc +a0get a0malloc arena_aalloc arena_alloc_junk_small @@ -167,20 +168,15 @@ chunk_dalloc_mmap chunk_dalloc_wrapper chunk_deregister chunk_dss_boot -chunk_dss_postfork_child -chunk_dss_postfork_parent +chunk_dss_mergeable chunk_dss_prec_get chunk_dss_prec_set -chunk_dss_prefork chunk_hooks_default chunk_hooks_get chunk_hooks_set chunk_in_dss chunk_lookup chunk_npages -chunk_postfork_child -chunk_postfork_parent -chunk_prefork chunk_purge_wrapper chunk_register chunks_rtree @@ -360,6 +356,7 @@ nstime_idivide nstime_imultiply nstime_init nstime_init2 +nstime_monotonic nstime_ns nstime_nsec nstime_sec @@ -401,6 +398,10 @@ pages_map pages_purge pages_trim pages_unmap +pind2sz +pind2sz_compute +pind2sz_lookup +pind2sz_tab pow2_ceil_u32 pow2_ceil_u64 pow2_ceil_zu @@ -454,12 +455,13 @@ prof_thread_active_init_set prof_thread_active_set prof_thread_name_get prof_thread_name_set +psz2ind +psz2u purge_mode_names quarantine quarantine_alloc_hook quarantine_alloc_hook_work quarantine_cleanup -register_zone rtree_child_read rtree_child_read_hard rtree_child_tryread @@ -477,7 +479,6 @@ rtree_val_read rtree_val_write run_quantize_ceil run_quantize_floor -run_quantize_max s2u s2u_compute s2u_lookup @@ -487,6 +488,8 @@ size2index size2index_compute size2index_lookup size2index_tab +spin_adaptive +spin_init stats_cactive stats_cactive_add stats_cactive_get @@ -545,7 +548,9 @@ tsd_booted_get tsd_cleanup tsd_cleanup_wrapper tsd_fetch +tsd_fetch_impl tsd_get +tsd_get_allocates tsd_iarena_get tsd_iarena_set tsd_iarenap_get @@ -604,9 +609,11 @@ witness_lock witness_lock_error witness_lockless_error witness_not_owner_error +witness_owner witness_owner_error witness_postfork_child witness_postfork_parent witness_prefork witness_unlock witnesses_cleanup +zone_register diff --git a/include/jemalloc/internal/prof.h b/include/jemalloc/internal/prof.h index 21dff5fb..8293b71e 100644 --- a/include/jemalloc/internal/prof.h +++ b/include/jemalloc/internal/prof.h @@ -299,9 +299,9 @@ extern prof_dump_header_t *prof_dump_header; void prof_idump(tsdn_t *tsdn); bool prof_mdump(tsd_t *tsd, const char *filename); void prof_gdump(tsdn_t *tsdn); -prof_tdata_t *prof_tdata_init(tsdn_t *tsdn); +prof_tdata_t *prof_tdata_init(tsd_t *tsd); prof_tdata_t *prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata); -void prof_reset(tsdn_t *tsdn, size_t lg_sample); +void prof_reset(tsd_t *tsd, size_t lg_sample); void prof_tdata_cleanup(tsd_t *tsd); bool prof_active_get(tsdn_t *tsdn); bool prof_active_set(tsdn_t *tsdn, bool active); @@ -315,7 +315,7 @@ bool prof_gdump_get(tsdn_t *tsdn); bool prof_gdump_set(tsdn_t *tsdn, bool active); void prof_boot0(void); void prof_boot1(void); -bool prof_boot2(tsdn_t *tsdn); +bool prof_boot2(tsd_t *tsd); void prof_prefork0(tsdn_t *tsdn); void prof_prefork1(tsdn_t *tsdn); void 
prof_postfork_parent(tsdn_t *tsdn); @@ -384,7 +384,7 @@ prof_tdata_get(tsd_t *tsd, bool create) if (create) { if (unlikely(tdata == NULL)) { if (tsd_nominal(tsd)) { - tdata = prof_tdata_init(tsd_tsdn(tsd)); + tdata = prof_tdata_init(tsd); tsd_prof_tdata_set(tsd, tdata); } } else if (unlikely(tdata->expired)) { diff --git a/include/jemalloc/internal/size_classes.sh b/include/jemalloc/internal/size_classes.sh index 2b0ca29a..f6fbce4e 100755 --- a/include/jemalloc/internal/size_classes.sh +++ b/include/jemalloc/internal/size_classes.sh @@ -48,6 +48,21 @@ size_class() { lg_p=$5 lg_kmax=$6 + if [ ${lg_delta} -ge ${lg_p} ] ; then + psz="yes" + else + pow2 ${lg_p}; p=${pow2_result} + pow2 ${lg_grp}; grp=${pow2_result} + pow2 ${lg_delta}; delta=${pow2_result} + sz=$((${grp} + ${delta} * ${ndelta})) + npgs=$((${sz} / ${p})) + if [ ${sz} -eq $((${npgs} * ${p})) ] ; then + psz="yes" + else + psz="no" + fi + fi + lg ${ndelta}; lg_ndelta=${lg_result}; pow2 ${lg_ndelta} if [ ${pow2_result} -lt ${ndelta} ] ; then rem="yes" @@ -74,14 +89,15 @@ size_class() { else lg_delta_lookup="no" fi - printf ' SC(%3d, %6d, %8d, %6d, %3s, %2s) \\\n' ${index} ${lg_grp} ${lg_delta} ${ndelta} ${bin} ${lg_delta_lookup} + printf ' SC(%3d, %6d, %8d, %6d, %3s, %3s, %2s) \\\n' ${index} ${lg_grp} ${lg_delta} ${ndelta} ${psz} ${bin} ${lg_delta_lookup} # Defined upon return: - # - lg_delta_lookup (${lg_delta} or "no") + # - psz ("yes" or "no") # - bin ("yes" or "no") + # - lg_delta_lookup (${lg_delta} or "no") } sep_line() { - echo " \\" + echo " \\" } size_classes() { @@ -95,12 +111,13 @@ size_classes() { pow2 ${lg_g}; g=${pow2_result} echo "#define SIZE_CLASSES \\" - echo " /* index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup */ \\" + echo " /* index, lg_grp, lg_delta, ndelta, psz, bin, lg_delta_lookup */ \\" ntbins=0 nlbins=0 lg_tiny_maxclass='"NA"' nbins=0 + npsizes=0 # Tiny size classes. ndelta=0 @@ -112,6 +129,9 @@ size_classes() { if [ ${lg_delta_lookup} != "no" ] ; then nlbins=$((${index} + 1)) fi + if [ ${psz} = "yes" ] ; then + npsizes=$((${npsizes} + 1)) + fi if [ ${bin} != "no" ] ; then nbins=$((${index} + 1)) fi @@ -133,11 +153,17 @@ size_classes() { index=$((${index} + 1)) lg_grp=$((${lg_grp} + 1)) lg_delta=$((${lg_delta} + 1)) + if [ ${psz} = "yes" ] ; then + npsizes=$((${npsizes} + 1)) + fi fi while [ ${ndelta} -lt ${g} ] ; do size_class ${index} ${lg_grp} ${lg_delta} ${ndelta} ${lg_p} ${lg_kmax} index=$((${index} + 1)) ndelta=$((${ndelta} + 1)) + if [ ${psz} = "yes" ] ; then + npsizes=$((${npsizes} + 1)) + fi done # All remaining groups. 
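The psz logic added to size_class() above marks a size class as a page size class when its size works out to a whole number of pages, and npsizes counts those classes to produce NPSIZES (the bound used by pind2sz_tab and psz2ind() elsewhere in this diff). The same predicate restated as standalone C, with the hypothetical name example_is_psz standing in for the shell arithmetic:

#include <stdbool.h>
#include <stddef.h>

/*
 * True iff the size class (1 << lg_grp) + ndelta * (1 << lg_delta) spans a
 * whole number of pages of size (1 << lg_p).
 */
static bool
example_is_psz(unsigned lg_grp, unsigned lg_delta, size_t ndelta, unsigned lg_p)
{
	size_t p, sz;

	if (lg_delta >= lg_p)
		return (true);	/* Every delta step is already page-aligned. */
	p = (size_t)1 << lg_p;
	sz = ((size_t)1 << lg_grp) + (((size_t)1 << lg_delta) * ndelta);
	return (sz % p == 0);	/* Same test as the npgs arithmetic above. */
}

For example, with lg_p = 12 (4 KiB pages) a 20480-byte class qualifies, while a 2560-byte class (2048 + 512) does not.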
@@ -157,6 +183,9 @@ size_classes() { # Final written value is correct: lookup_maxclass="((((size_t)1) << ${lg_grp}) + (((size_t)${ndelta}) << ${lg_delta}))" fi + if [ ${psz} = "yes" ] ; then + npsizes=$((${npsizes} + 1)) + fi if [ ${bin} != "no" ] ; then nbins=$((${index} + 1)) # Final written value is correct: @@ -183,6 +212,7 @@ size_classes() { # - nlbins # - nbins # - nsizes + # - npsizes # - lg_tiny_maxclass # - lookup_maxclass # - small_maxclass @@ -200,13 +230,13 @@ cat <iteration = 0; +} + +JEMALLOC_INLINE void +spin_adaptive(spin_t *spin) +{ + volatile uint64_t i; + + for (i = 0; i < (KQU(1) << spin->iteration); i++) + CPU_SPINWAIT; + + if (spin->iteration < 63) + spin->iteration++; +} + +#endif + +#endif /* JEMALLOC_H_INLINES */ +/******************************************************************************/ + diff --git a/include/jemalloc/internal/tcache.h b/include/jemalloc/internal/tcache.h index 70883b1a..01ba062d 100644 --- a/include/jemalloc/internal/tcache.h +++ b/include/jemalloc/internal/tcache.h @@ -145,7 +145,7 @@ tcache_t *tcache_create(tsdn_t *tsdn, arena_t *arena); void tcache_cleanup(tsd_t *tsd); void tcache_enabled_cleanup(tsd_t *tsd); void tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena); -bool tcaches_create(tsdn_t *tsdn, unsigned *r_ind); +bool tcaches_create(tsd_t *tsd, unsigned *r_ind); void tcaches_flush(tsd_t *tsd, unsigned ind); void tcaches_destroy(tsd_t *tsd, unsigned ind); bool tcache_boot(tsdn_t *tsdn); diff --git a/include/jemalloc/internal/tsd.h b/include/jemalloc/internal/tsd.h index bf113411..9055acaf 100644 --- a/include/jemalloc/internal/tsd.h +++ b/include/jemalloc/internal/tsd.h @@ -48,7 +48,7 @@ typedef enum { * * bool example_tsd_boot(void) {...} * bool example_tsd_booted_get(void) {...} - * example_t *example_tsd_get() {...} + * example_t *example_tsd_get(bool init) {...} * void example_tsd_set(example_t *val) {...} * * Note that all of the functions deal in terms of (a_type *) rather than @@ -105,7 +105,7 @@ a_name##tsd_boot(void); \ a_attr bool \ a_name##tsd_booted_get(void); \ a_attr a_type * \ -a_name##tsd_get(void); \ +a_name##tsd_get(bool init); \ a_attr void \ a_name##tsd_set(a_type *val); @@ -213,9 +213,15 @@ a_name##tsd_booted_get(void) \ \ return (a_name##tsd_booted); \ } \ +a_attr bool \ +a_name##tsd_get_allocates(void) \ +{ \ + \ + return (false); \ +} \ /* Get/set. */ \ a_attr a_type * \ -a_name##tsd_get(void) \ +a_name##tsd_get(bool init) \ { \ \ assert(a_name##tsd_booted); \ @@ -264,9 +270,15 @@ a_name##tsd_booted_get(void) \ \ return (a_name##tsd_booted); \ } \ +a_attr bool \ +a_name##tsd_get_allocates(void) \ +{ \ + \ + return (false); \ +} \ /* Get/set. */ \ a_attr a_type * \ -a_name##tsd_get(void) \ +a_name##tsd_get(bool init) \ { \ \ assert(a_name##tsd_booted); \ @@ -325,14 +337,14 @@ a_name##tsd_wrapper_set(a_name##tsd_wrapper_t *wrapper) \ } \ } \ a_attr a_name##tsd_wrapper_t * \ -a_name##tsd_wrapper_get(void) \ +a_name##tsd_wrapper_get(bool init) \ { \ DWORD error = GetLastError(); \ a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *) \ TlsGetValue(a_name##tsd_tsd); \ SetLastError(error); \ \ - if (unlikely(wrapper == NULL)) { \ + if (init && unlikely(wrapper == NULL)) { \ wrapper = (a_name##tsd_wrapper_t *) \ malloc_tsd_malloc(sizeof(a_name##tsd_wrapper_t)); \ if (wrapper == NULL) { \ @@ -392,14 +404,22 @@ a_name##tsd_booted_get(void) \ \ return (a_name##tsd_booted); \ } \ +a_attr bool \ +a_name##tsd_get_allocates(void) \ +{ \ + \ + return (true); \ +} \ /* Get/set. 
*/ \ a_attr a_type * \ -a_name##tsd_get(void) \ +a_name##tsd_get(bool init) \ { \ a_name##tsd_wrapper_t *wrapper; \ \ assert(a_name##tsd_booted); \ - wrapper = a_name##tsd_wrapper_get(); \ + wrapper = a_name##tsd_wrapper_get(init); \ + if (a_name##tsd_get_allocates() && !init && wrapper == NULL) \ + return (NULL); \ return (&wrapper->val); \ } \ a_attr void \ @@ -408,7 +428,7 @@ a_name##tsd_set(a_type *val) \ a_name##tsd_wrapper_t *wrapper; \ \ assert(a_name##tsd_booted); \ - wrapper = a_name##tsd_wrapper_get(); \ + wrapper = a_name##tsd_wrapper_get(true); \ wrapper->val = *(val); \ if (a_cleanup != malloc_tsd_no_cleanup) \ wrapper->initialized = true; \ @@ -452,12 +472,12 @@ a_name##tsd_wrapper_set(a_name##tsd_wrapper_t *wrapper) \ } \ } \ a_attr a_name##tsd_wrapper_t * \ -a_name##tsd_wrapper_get(void) \ +a_name##tsd_wrapper_get(bool init) \ { \ a_name##tsd_wrapper_t *wrapper = (a_name##tsd_wrapper_t *) \ pthread_getspecific(a_name##tsd_tsd); \ \ - if (unlikely(wrapper == NULL)) { \ + if (init && unlikely(wrapper == NULL)) { \ tsd_init_block_t block; \ wrapper = tsd_init_check_recursion( \ &a_name##tsd_init_head, &block); \ @@ -520,14 +540,22 @@ a_name##tsd_booted_get(void) \ \ return (a_name##tsd_booted); \ } \ +a_attr bool \ +a_name##tsd_get_allocates(void) \ +{ \ + \ + return (true); \ +} \ /* Get/set. */ \ a_attr a_type * \ -a_name##tsd_get(void) \ +a_name##tsd_get(bool init) \ { \ a_name##tsd_wrapper_t *wrapper; \ \ assert(a_name##tsd_booted); \ - wrapper = a_name##tsd_wrapper_get(); \ + wrapper = a_name##tsd_wrapper_get(init); \ + if (a_name##tsd_get_allocates() && !init && wrapper == NULL) \ + return (NULL); \ return (&wrapper->val); \ } \ a_attr void \ @@ -536,7 +564,7 @@ a_name##tsd_set(a_type *val) \ a_name##tsd_wrapper_t *wrapper; \ \ assert(a_name##tsd_booted); \ - wrapper = a_name##tsd_wrapper_get(); \ + wrapper = a_name##tsd_wrapper_get(true); \ wrapper->val = *(val); \ if (a_cleanup != malloc_tsd_no_cleanup) \ wrapper->initialized = true; \ @@ -639,6 +667,7 @@ void tsd_cleanup(void *arg); #ifndef JEMALLOC_ENABLE_INLINE malloc_tsd_protos(JEMALLOC_ATTR(unused), , tsd_t) +tsd_t *tsd_fetch_impl(bool init); tsd_t *tsd_fetch(void); tsdn_t *tsd_tsdn(tsd_t *tsd); bool tsd_nominal(tsd_t *tsd); @@ -658,9 +687,13 @@ malloc_tsd_externs(, tsd_t) malloc_tsd_funcs(JEMALLOC_ALWAYS_INLINE, , tsd_t, tsd_initializer, tsd_cleanup) JEMALLOC_ALWAYS_INLINE tsd_t * -tsd_fetch(void) +tsd_fetch_impl(bool init) { - tsd_t *tsd = tsd_get(); + tsd_t *tsd = tsd_get(init); + + if (!init && tsd_get_allocates() && tsd == NULL) + return (NULL); + assert(tsd != NULL); if (unlikely(tsd->state != tsd_state_nominal)) { if (tsd->state == tsd_state_uninitialized) { @@ -677,6 +710,13 @@ tsd_fetch(void) return (tsd); } +JEMALLOC_ALWAYS_INLINE tsd_t * +tsd_fetch(void) +{ + + return (tsd_fetch_impl(true)); +} + JEMALLOC_ALWAYS_INLINE tsdn_t * tsd_tsdn(tsd_t *tsd) { @@ -723,7 +763,7 @@ tsdn_fetch(void) if (!tsd_booted_get()) return (NULL); - return (tsd_tsdn(tsd_fetch())); + return (tsd_tsdn(tsd_fetch_impl(false))); } JEMALLOC_ALWAYS_INLINE bool diff --git a/include/jemalloc/internal/util.h b/include/jemalloc/internal/util.h index a0c2203d..aee00d6d 100644 --- a/include/jemalloc/internal/util.h +++ b/include/jemalloc/internal/util.h @@ -61,30 +61,20 @@ # define JEMALLOC_CC_SILENCE_INIT(v) #endif -#define JEMALLOC_GNUC_PREREQ(major, minor) \ - (!defined(__clang__) && \ - (__GNUC__ > (major) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))) -#ifndef __has_builtin -# define __has_builtin(builtin) (0) -#endif 
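The tsd_get(bool init) signature change above distinguishes backends whose getter may allocate (the TlsGetValue and pthread_getspecific wrapper variants, which now report tsd_get_allocates() as true) from raw TLS backends (which report false). That is what lets tsdn_fetch() call tsd_fetch_impl(false) and treat missing TSD as NULL rather than allocating reentrantly. A toy model of the contract, with hypothetical example_* names standing in for the generated tsd functions:

#include <stdbool.h>
#include <stddef.h>

typedef struct { unsigned depth; } example_state_t;

static example_state_t *example_slot = NULL;	/* Stand-in for TLS storage. */

static bool
example_tsd_get_allocates(void)
{
	return (true);	/* Wrapper-based backends allocate on first get. */
}

static example_state_t *
example_tsd_get(bool init)
{
	static example_state_t wrapper;	/* Stand-in for the malloc'ed wrapper. */

	if (example_slot == NULL && init)
		example_slot = &wrapper;
	return (example_slot);
}

/* Non-allocating read: safe to call from allocation paths themselves. */
static example_state_t *
example_tsd_peek(void)
{
	example_state_t *s = example_tsd_get(false);

	if (example_tsd_get_allocates() && s == NULL)
		return (NULL);	/* No TSD yet; the caller copes, never allocates. */
	return (s);
}

The point of the split is visible in example_tsd_peek(): on a wrapper backend it can run inside the allocator itself, because the init=false path never reaches malloc_tsd_malloc().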
-#define JEMALLOC_CLANG_HAS_BUILTIN(builtin) \ - (defined(__clang__) && __has_builtin(builtin)) - #ifdef __GNUC__ # define likely(x) __builtin_expect(!!(x), 1) # define unlikely(x) __builtin_expect(!!(x), 0) -# if JEMALLOC_GNUC_PREREQ(4, 6) || \ - JEMALLOC_CLANG_HAS_BUILTIN(__builtin_unreachable) -# define unreachable() __builtin_unreachable() -# else -# define unreachable() abort() -# endif #else # define likely(x) !!(x) # define unlikely(x) !!(x) -# define unreachable() abort() #endif +#if !defined(JEMALLOC_INTERNAL_UNREACHABLE) +# error JEMALLOC_INTERNAL_UNREACHABLE should have been defined by configure +#endif + +#define unreachable() JEMALLOC_INTERNAL_UNREACHABLE() + #include "jemalloc/internal/assert.h" /* Use to assert a particular configuration, e.g., cassert(config_debug). */ diff --git a/include/jemalloc/internal/witness.h b/include/jemalloc/internal/witness.h index d78dca2d..cdf15d79 100644 --- a/include/jemalloc/internal/witness.h +++ b/include/jemalloc/internal/witness.h @@ -108,6 +108,7 @@ void witness_postfork_child(tsd_t *tsd); #ifdef JEMALLOC_H_INLINES #ifndef JEMALLOC_ENABLE_INLINE +bool witness_owner(tsd_t *tsd, const witness_t *witness); void witness_assert_owner(tsdn_t *tsdn, const witness_t *witness); void witness_assert_not_owner(tsdn_t *tsdn, const witness_t *witness); void witness_assert_lockless(tsdn_t *tsdn); @@ -116,12 +117,25 @@ void witness_unlock(tsdn_t *tsdn, witness_t *witness); #endif #if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_MUTEX_C_)) +JEMALLOC_INLINE bool +witness_owner(tsd_t *tsd, const witness_t *witness) +{ + witness_list_t *witnesses; + witness_t *w; + + witnesses = tsd_witnessesp_get(tsd); + ql_foreach(w, witnesses, link) { + if (w == witness) + return (true); + } + + return (false); +} + JEMALLOC_INLINE void witness_assert_owner(tsdn_t *tsdn, const witness_t *witness) { tsd_t *tsd; - witness_list_t *witnesses; - witness_t *w; if (!config_debug) return; @@ -132,11 +146,8 @@ witness_assert_owner(tsdn_t *tsdn, const witness_t *witness) if (witness->rank == WITNESS_RANK_OMIT) return; - witnesses = tsd_witnessesp_get(tsd); - ql_foreach(w, witnesses, link) { - if (w == witness) - return; - } + if (witness_owner(tsd, witness)) + return; witness_owner_error(witness); } @@ -238,10 +249,16 @@ witness_unlock(tsdn_t *tsdn, witness_t *witness) if (witness->rank == WITNESS_RANK_OMIT) return; - witness_assert_owner(tsdn, witness); - - witnesses = tsd_witnessesp_get(tsd); - ql_remove(witnesses, witness, link); + /* + * Check whether the witness is owned before removal, rather than + * relying on witness_assert_owner() to abort, so that unit tests can + * test this function's failure mode without causing undefined behavior. + */ + if (witness_owner(tsd, witness)) { + witnesses = tsd_witnessesp_get(tsd); + ql_remove(witnesses, witness, link); + } else + witness_assert_owner(tsdn, witness); } #endif diff --git a/jemalloc.pc.in b/jemalloc.pc.in index 1a3ad9b3..a318e8dd 100644 --- a/jemalloc.pc.in +++ b/jemalloc.pc.in @@ -6,7 +6,7 @@ install_suffix=@install_suffix@ Name: jemalloc Description: A general purpose malloc(3) implementation that emphasizes fragmentation avoidance and scalable concurrency support. 
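witness_owner() above factors the ownership scan out of witness_assert_owner() so that witness_unlock() can test ownership without triggering an abort, keeping the failure mode unit-testable. A sketch of that check-then-report split over a toy singly linked list; every example_* name is hypothetical, and abort() merely stands in for witness_owner_error():

#include <stdbool.h>
#include <stddef.h>
#include <stdlib.h>

typedef struct example_witness_s example_witness_t;
struct example_witness_s {
	example_witness_t *next;	/* Per-thread list of held witnesses. */
};

static bool
example_owner(example_witness_t *head, const example_witness_t *w)
{
	example_witness_t *it;

	for (it = head; it != NULL; it = it->next) {
		if (it == w)
			return (true);
	}
	return (false);
}

static void
example_unlock(example_witness_t **head, example_witness_t *w)
{
	if (example_owner(*head, w)) {
		/* Owned: unlink (head removal only, for brevity). */
		if (*head == w)
			*head = w->next;
	} else {
		/* Not owned: report rather than corrupt the list. */
		abort();
	}
}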
-URL: http://www.canonware.com/jemalloc +URL: http://jemalloc.net/ Version: @jemalloc_version@ Cflags: -I${includedir} Libs: -L${libdir} -ljemalloc${install_suffix} diff --git a/msvc/ReadMe.txt b/msvc/ReadMe.txt index 02b97f74..77d567da 100644 --- a/msvc/ReadMe.txt +++ b/msvc/ReadMe.txt @@ -17,7 +17,7 @@ How to build jemalloc for Windows (note: x86/x64 doesn't matter at this point) 5. Generate header files: - sh -c "./autogen.sh CC=cl --enable-lazy-lock=no" + sh -c "CC=cl ./autogen.sh" 6. Now the project can be opened and built in Visual Studio: msvc\jemalloc_vc2015.sln diff --git a/src/arena.c b/src/arena.c index ce62590b..43c3ccf2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -21,15 +21,8 @@ size_t map_bias; size_t map_misc_offset; size_t arena_maxrun; /* Max run size for arenas. */ size_t large_maxclass; /* Max large size class. */ -size_t run_quantize_max; /* Max run_quantize_*() input. */ -static size_t small_maxrun; /* Max run size for small size classes. */ -static bool *small_run_tab; /* Valid small run page multiples. */ -static size_t *run_quantize_floor_tab; /* run_quantize_floor() memoization. */ -static size_t *run_quantize_ceil_tab; /* run_quantize_ceil() memoization. */ unsigned nlclasses; /* Number of large size classes. */ unsigned nhclasses; /* Number of huge size classes. */ -static szind_t runs_avail_bias; /* Size index for first runs_avail tree. */ -static szind_t runs_avail_nclasses; /* Number of runs_avail trees. */ /******************************************************************************/ /* @@ -37,6 +30,8 @@ static szind_t runs_avail_nclasses; /* Number of runs_avail trees. */ * definition. */ +static void arena_chunk_dalloc(tsdn_t *tsdn, arena_t *arena, + arena_chunk_t *chunk); static void arena_purge_to_limit(tsdn_t *tsdn, arena_t *arena, size_t ndirty_limit); static void arena_run_dalloc(tsdn_t *tsdn, arena_t *arena, arena_run_t *run, @@ -77,83 +72,6 @@ arena_run_addr_comp(const arena_chunk_map_misc_t *a, ph_gen(static UNUSED, arena_run_heap_, arena_run_heap_t, arena_chunk_map_misc_t, ph_link, arena_run_addr_comp) -static size_t -run_quantize_floor_compute(size_t size) -{ - size_t qsize; - - assert(size != 0); - assert(size == PAGE_CEILING(size)); - - /* Don't change sizes that are valid small run sizes. */ - if (size <= small_maxrun && small_run_tab[size >> LG_PAGE]) - return (size); - - /* - * Round down to the nearest run size that can actually be requested - * during normal large allocation. Add large_pad so that cache index - * randomization can offset the allocation from the page boundary. - */ - qsize = index2size(size2index(size - large_pad + 1) - 1) + large_pad; - if (qsize <= SMALL_MAXCLASS + large_pad) - return (run_quantize_floor_compute(size - large_pad)); - assert(qsize <= size); - return (qsize); -} - -static size_t -run_quantize_ceil_compute_hard(size_t size) -{ - size_t large_run_size_next; - - assert(size != 0); - assert(size == PAGE_CEILING(size)); - - /* - * Return the next quantized size greater than the input size. - * Quantized sizes comprise the union of run sizes that back small - * region runs, and run sizes that back large regions with no explicit - * alignment constraints. 
- */ - - if (size > SMALL_MAXCLASS) { - large_run_size_next = PAGE_CEILING(index2size(size2index(size - - large_pad) + 1) + large_pad); - } else - large_run_size_next = SIZE_T_MAX; - if (size >= small_maxrun) - return (large_run_size_next); - - while (true) { - size += PAGE; - assert(size <= small_maxrun); - if (small_run_tab[size >> LG_PAGE]) { - if (large_run_size_next < size) - return (large_run_size_next); - return (size); - } - } -} - -static size_t -run_quantize_ceil_compute(size_t size) -{ - size_t qsize = run_quantize_floor_compute(size); - - if (qsize < size) { - /* - * Skip a quantization that may have an adequately large run, - * because under-sized runs may be mixed in. This only happens - * when an unusual size is requested, i.e. for aligned - * allocation, and is just one of several places where linear - * search would potentially find sufficiently aligned available - * memory somewhere lower. - */ - qsize = run_quantize_ceil_compute_hard(qsize); - } - return (qsize); -} - #ifdef JEMALLOC_JET #undef run_quantize_floor #define run_quantize_floor JEMALLOC_N(n_run_quantize_floor) @@ -162,13 +80,27 @@ static size_t run_quantize_floor(size_t size) { size_t ret; + pszind_t pind; assert(size > 0); - assert(size <= run_quantize_max); + assert(size <= HUGE_MAXCLASS); assert((size & PAGE_MASK) == 0); - ret = run_quantize_floor_tab[(size >> LG_PAGE) - 1]; - assert(ret == run_quantize_floor_compute(size)); + assert(size != 0); + assert(size == PAGE_CEILING(size)); + + pind = psz2ind(size - large_pad + 1); + if (pind == 0) { + /* + * Avoid underflow. This short-circuit would also do the right + * thing for all sizes in the range for which there are + * PAGE-spaced size classes, but it's simplest to just handle + * the one case that would cause erroneous results. + */ + return (size); + } + ret = pind2sz(pind - 1) + large_pad; + assert(ret <= size); return (ret); } #ifdef JEMALLOC_JET @@ -187,11 +119,21 @@ run_quantize_ceil(size_t size) size_t ret; assert(size > 0); - assert(size <= run_quantize_max); + assert(size <= HUGE_MAXCLASS); assert((size & PAGE_MASK) == 0); - ret = run_quantize_ceil_tab[(size >> LG_PAGE) - 1]; - assert(ret == run_quantize_ceil_compute(size)); + ret = run_quantize_floor(size); + if (ret < size) { + /* + * Skip a quantization that may have an adequately large run, + * because under-sized runs may be mixed in. This only happens + * when an unusual size is requested, i.e. for aligned + * allocation, and is just one of several places where linear + * search would potentially find sufficiently aligned available + * memory somewhere lower. 
+ */ + ret = pind2sz(psz2ind(ret - large_pad + 1)) + large_pad; + } return (ret); } #ifdef JEMALLOC_JET @@ -200,25 +142,15 @@ run_quantize_ceil(size_t size) run_quantize_t *run_quantize_ceil = JEMALLOC_N(n_run_quantize_ceil); #endif -static arena_run_heap_t * -arena_runs_avail_get(arena_t *arena, szind_t ind) -{ - - assert(ind >= runs_avail_bias); - assert(ind - runs_avail_bias < runs_avail_nclasses); - - return (&arena->runs_avail[ind - runs_avail_bias]); -} - static void arena_avail_insert(arena_t *arena, arena_chunk_t *chunk, size_t pageind, size_t npages) { - szind_t ind = size2index(run_quantize_floor(arena_miscelm_size_get( + pszind_t pind = psz2ind(run_quantize_floor(arena_miscelm_size_get( arena_miscelm_get_const(chunk, pageind)))); assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >> LG_PAGE)); - arena_run_heap_insert(arena_runs_avail_get(arena, ind), + arena_run_heap_insert(&arena->runs_avail[pind], arena_miscelm_get_mutable(chunk, pageind)); } @@ -226,11 +158,11 @@ static void arena_avail_remove(arena_t *arena, arena_chunk_t *chunk, size_t pageind, size_t npages) { - szind_t ind = size2index(run_quantize_floor(arena_miscelm_size_get( + pszind_t pind = psz2ind(run_quantize_floor(arena_miscelm_size_get( arena_miscelm_get_const(chunk, pageind)))); assert(npages == (arena_mapbits_unallocated_size_get(chunk, pageind) >> LG_PAGE)); - arena_run_heap_remove(arena_runs_avail_get(arena, ind), + arena_run_heap_remove(&arena->runs_avail[pind], arena_miscelm_get_mutable(chunk, pageind)); } @@ -649,14 +581,13 @@ arena_chunk_alloc_internal(tsdn_t *tsdn, arena_t *arena, bool *zero, chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; chunk = chunk_alloc_cache(tsdn, arena, &chunk_hooks, NULL, chunksize, - chunksize, zero, true); + chunksize, zero, commit, true); if (chunk != NULL) { if (arena_chunk_register(tsdn, arena, chunk, *zero)) { chunk_dalloc_cache(tsdn, arena, &chunk_hooks, chunk, chunksize, true); return (NULL); } - *commit = true; } if (chunk == NULL) { chunk = arena_chunk_alloc_internal_hard(tsdn, arena, @@ -953,6 +884,7 @@ arena_chunk_alloc_huge(tsdn_t *tsdn, arena_t *arena, size_t usize, void *ret; chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; size_t csize = CHUNK_CEILING(usize); + bool commit = true; malloc_mutex_lock(tsdn, &arena->lock); @@ -964,7 +896,7 @@ arena_chunk_alloc_huge(tsdn_t *tsdn, arena_t *arena, size_t usize, arena_nactive_add(arena, usize >> LG_PAGE); ret = chunk_alloc_cache(tsdn, arena, &chunk_hooks, NULL, csize, - alignment, zero, true); + alignment, zero, &commit, true); malloc_mutex_unlock(tsdn, &arena->lock); if (ret == NULL) { ret = arena_chunk_alloc_huge_hard(tsdn, arena, &chunk_hooks, @@ -1074,6 +1006,7 @@ arena_chunk_ralloc_huge_expand(tsdn_t *tsdn, arena_t *arena, void *chunk, void *nchunk = (void *)((uintptr_t)chunk + CHUNK_CEILING(oldsize)); size_t udiff = usize - oldsize; size_t cdiff = CHUNK_CEILING(usize) - CHUNK_CEILING(oldsize); + bool commit = true; malloc_mutex_lock(tsdn, &arena->lock); @@ -1085,7 +1018,7 @@ arena_chunk_ralloc_huge_expand(tsdn_t *tsdn, arena_t *arena, void *chunk, arena_nactive_add(arena, udiff >> LG_PAGE); err = (chunk_alloc_cache(tsdn, arena, &chunk_hooks, nchunk, cdiff, - chunksize, zero, true) == NULL); + chunksize, zero, &commit, true) == NULL); malloc_mutex_unlock(tsdn, &arena->lock); if (err) { err = arena_chunk_ralloc_huge_expand_hard(tsdn, arena, @@ -1109,12 +1042,13 @@ arena_chunk_ralloc_huge_expand(tsdn_t *tsdn, arena_t *arena, void *chunk, static arena_run_t * arena_run_first_best_fit(arena_t 
*arena, size_t size) { - szind_t ind, i; + pszind_t pind, i; - ind = size2index(run_quantize_ceil(size)); - for (i = ind; i < runs_avail_nclasses + runs_avail_bias; i++) { + pind = psz2ind(run_quantize_ceil(size)); + + for (i = pind; pind2sz(i) <= large_maxclass; i++) { arena_chunk_map_misc_t *miscelm = arena_run_heap_first( - arena_runs_avail_get(arena, i)); + &arena->runs_avail[i]); if (miscelm != NULL) return (&miscelm->run); } @@ -1125,7 +1059,7 @@ arena_run_first_best_fit(arena_t *arena, size_t size) static arena_run_t * arena_run_alloc_large_helper(arena_t *arena, size_t size, bool zero) { - arena_run_t *run = arena_run_first_best_fit(arena, s2u(size)); + arena_run_t *run = arena_run_first_best_fit(arena, size); if (run != NULL) { if (arena_run_split_large(arena, run, size, zero)) run = NULL; @@ -1256,14 +1190,14 @@ arena_decay_deadline_init(arena_t *arena) * Generate a new deadline that is uniformly random within the next * epoch after the current one. */ - nstime_copy(&arena->decay_deadline, &arena->decay_epoch); - nstime_add(&arena->decay_deadline, &arena->decay_interval); - if (arena->decay_time > 0) { + nstime_copy(&arena->decay.deadline, &arena->decay.epoch); + nstime_add(&arena->decay.deadline, &arena->decay.interval); + if (arena->decay.time > 0) { nstime_t jitter; - nstime_init(&jitter, prng_range(&arena->decay_jitter_state, - nstime_ns(&arena->decay_interval))); - nstime_add(&arena->decay_deadline, &jitter); + nstime_init(&jitter, prng_range(&arena->decay.jitter_state, + nstime_ns(&arena->decay.interval))); + nstime_add(&arena->decay.deadline, &jitter); } } @@ -1273,7 +1207,7 @@ arena_decay_deadline_reached(const arena_t *arena, const nstime_t *time) assert(opt_purge == purge_mode_decay); - return (nstime_compare(&arena->decay_deadline, time) <= 0); + return (nstime_compare(&arena->decay.deadline, time) <= 0); } static size_t @@ -1298,92 +1232,103 @@ arena_decay_backlog_npages_limit(const arena_t *arena) */ sum = 0; for (i = 0; i < SMOOTHSTEP_NSTEPS; i++) - sum += arena->decay_backlog[i] * h_steps[i]; + sum += arena->decay.backlog[i] * h_steps[i]; npages_limit_backlog = (size_t)(sum >> SMOOTHSTEP_BFP); return (npages_limit_backlog); } static void -arena_decay_epoch_advance(arena_t *arena, const nstime_t *time) +arena_decay_backlog_update_last(arena_t *arena) { - uint64_t nadvance_u64; - nstime_t delta; - size_t ndirty_delta; + size_t ndirty_delta = (arena->ndirty > arena->decay.ndirty) ? + arena->ndirty - arena->decay.ndirty : 0; + arena->decay.backlog[SMOOTHSTEP_NSTEPS-1] = ndirty_delta; +} - assert(opt_purge == purge_mode_decay); - assert(arena_decay_deadline_reached(arena, time)); +static void +arena_decay_backlog_update(arena_t *arena, uint64_t nadvance_u64) +{ - nstime_copy(&delta, time); - nstime_subtract(&delta, &arena->decay_epoch); - nadvance_u64 = nstime_divide(&delta, &arena->decay_interval); - assert(nadvance_u64 > 0); - - /* Add nadvance_u64 decay intervals to epoch. */ - nstime_copy(&delta, &arena->decay_interval); - nstime_imultiply(&delta, nadvance_u64); - nstime_add(&arena->decay_epoch, &delta); - - /* Set a new deadline. */ - arena_decay_deadline_init(arena); - - /* Update the backlog. 
*/ if (nadvance_u64 >= SMOOTHSTEP_NSTEPS) { - memset(arena->decay_backlog, 0, (SMOOTHSTEP_NSTEPS-1) * + memset(arena->decay.backlog, 0, (SMOOTHSTEP_NSTEPS-1) * sizeof(size_t)); } else { size_t nadvance_z = (size_t)nadvance_u64; assert((uint64_t)nadvance_z == nadvance_u64); - memmove(arena->decay_backlog, &arena->decay_backlog[nadvance_z], + memmove(arena->decay.backlog, &arena->decay.backlog[nadvance_z], (SMOOTHSTEP_NSTEPS - nadvance_z) * sizeof(size_t)); if (nadvance_z > 1) { - memset(&arena->decay_backlog[SMOOTHSTEP_NSTEPS - + memset(&arena->decay.backlog[SMOOTHSTEP_NSTEPS - nadvance_z], 0, (nadvance_z-1) * sizeof(size_t)); } } - ndirty_delta = (arena->ndirty > arena->decay_ndirty) ? arena->ndirty - - arena->decay_ndirty : 0; - arena->decay_ndirty = arena->ndirty; - arena->decay_backlog[SMOOTHSTEP_NSTEPS-1] = ndirty_delta; - arena->decay_backlog_npages_limit = - arena_decay_backlog_npages_limit(arena); + + arena_decay_backlog_update_last(arena); } -static size_t -arena_decay_npages_limit(arena_t *arena) +static void +arena_decay_epoch_advance_helper(arena_t *arena, const nstime_t *time) { - size_t npages_limit; + uint64_t nadvance_u64; + nstime_t delta; assert(opt_purge == purge_mode_decay); + assert(arena_decay_deadline_reached(arena, time)); - npages_limit = arena->decay_backlog_npages_limit; + nstime_copy(&delta, time); + nstime_subtract(&delta, &arena->decay.epoch); + nadvance_u64 = nstime_divide(&delta, &arena->decay.interval); + assert(nadvance_u64 > 0); - /* Add in any dirty pages created during the current epoch. */ - if (arena->ndirty > arena->decay_ndirty) - npages_limit += arena->ndirty - arena->decay_ndirty; + /* Add nadvance_u64 decay intervals to epoch. */ + nstime_copy(&delta, &arena->decay.interval); + nstime_imultiply(&delta, nadvance_u64); + nstime_add(&arena->decay.epoch, &delta); - return (npages_limit); + /* Set a new deadline. */ + arena_decay_deadline_init(arena); + + /* Update the backlog. 
*/ + arena_decay_backlog_update(arena, nadvance_u64); +} + +static void +arena_decay_epoch_advance_purge(tsdn_t *tsdn, arena_t *arena) +{ + size_t ndirty_limit = arena_decay_backlog_npages_limit(arena); + + if (arena->ndirty > ndirty_limit) + arena_purge_to_limit(tsdn, arena, ndirty_limit); + arena->decay.ndirty = arena->ndirty; +} + +static void +arena_decay_epoch_advance(tsdn_t *tsdn, arena_t *arena, const nstime_t *time) +{ + + arena_decay_epoch_advance_helper(arena, time); + arena_decay_epoch_advance_purge(tsdn, arena); } static void arena_decay_init(arena_t *arena, ssize_t decay_time) { - arena->decay_time = decay_time; + arena->decay.time = decay_time; if (decay_time > 0) { - nstime_init2(&arena->decay_interval, decay_time, 0); - nstime_idivide(&arena->decay_interval, SMOOTHSTEP_NSTEPS); + nstime_init2(&arena->decay.interval, decay_time, 0); + nstime_idivide(&arena->decay.interval, SMOOTHSTEP_NSTEPS); } - nstime_init(&arena->decay_epoch, 0); - nstime_update(&arena->decay_epoch); - arena->decay_jitter_state = (uint64_t)(uintptr_t)arena; + nstime_init(&arena->decay.epoch, 0); + nstime_update(&arena->decay.epoch); + arena->decay.jitter_state = (uint64_t)(uintptr_t)arena; arena_decay_deadline_init(arena); - arena->decay_ndirty = arena->ndirty; - arena->decay_backlog_npages_limit = 0; - memset(arena->decay_backlog, 0, SMOOTHSTEP_NSTEPS * sizeof(size_t)); + arena->decay.ndirty = arena->ndirty; + memset(arena->decay.backlog, 0, SMOOTHSTEP_NSTEPS * sizeof(size_t)); } static bool @@ -1403,7 +1348,7 @@ arena_decay_time_get(tsdn_t *tsdn, arena_t *arena) ssize_t decay_time; malloc_mutex_lock(tsdn, &arena->lock); - decay_time = arena->decay_time; + decay_time = arena->decay.time; malloc_mutex_unlock(tsdn, &arena->lock); return (decay_time); @@ -1464,35 +1409,44 @@ static void arena_maybe_purge_decay(tsdn_t *tsdn, arena_t *arena) { nstime_t time; - size_t ndirty_limit; assert(opt_purge == purge_mode_decay); /* Purge all or nothing if the option is disabled. */ - if (arena->decay_time <= 0) { - if (arena->decay_time == 0) + if (arena->decay.time <= 0) { + if (arena->decay.time == 0) arena_purge_to_limit(tsdn, arena, 0); return; } - nstime_copy(&time, &arena->decay_epoch); - if (unlikely(nstime_update(&time))) { - /* Time went backwards. Force an epoch advance. */ - nstime_copy(&time, &arena->decay_deadline); + nstime_init(&time, 0); + nstime_update(&time); + if (unlikely(!nstime_monotonic() && nstime_compare(&arena->decay.epoch, + &time) > 0)) { + /* + * Time went backwards. Move the epoch back in time and + * generate a new deadline, with the expectation that time + * typically flows forward for long enough periods of time that + * epochs complete. Unfortunately, this strategy is susceptible + * to clock jitter triggering premature epoch advances, but + * clock jitter estimation and compensation isn't feasible here + * because calls into this code are event-driven. + */ + nstime_copy(&arena->decay.epoch, &time); + arena_decay_deadline_init(arena); + } else { + /* Verify that time does not go backwards. */ + assert(nstime_compare(&arena->decay.epoch, &time) <= 0); } - if (arena_decay_deadline_reached(arena, &time)) - arena_decay_epoch_advance(arena, &time); - - ndirty_limit = arena_decay_npages_limit(arena); - /* - * Don't try to purge unless the number of purgeable pages exceeds the - * current limit. + * If the deadline has been reached, advance to the current epoch and + * purge to the new limit if necessary. 
Note that dirty pages created + * during the current epoch are not subject to purge until a future + * epoch, so as a result purging only happens during epoch advances. */ - if (arena->ndirty <= ndirty_limit) - return; - arena_purge_to_limit(tsdn, arena, ndirty_limit); + if (arena_decay_deadline_reached(arena, &time)) + arena_decay_epoch_advance(tsdn, arena, &time); } void @@ -1561,7 +1515,7 @@ arena_stash_dirty(tsdn_t *tsdn, arena_t *arena, chunk_hooks_t *chunk_hooks, if (rdelm == &chunkselm->rd) { extent_node_t *chunkselm_next; - bool zero; + bool zero, commit; UNUSED void *chunk; npages = extent_node_size_get(chunkselm) >> LG_PAGE; @@ -1575,10 +1529,11 @@ arena_stash_dirty(tsdn_t *tsdn, arena_t *arena, chunk_hooks_t *chunk_hooks, * dalloc_node=false argument to chunk_alloc_cache(). */ zero = false; + commit = false; chunk = chunk_alloc_cache(tsdn, arena, chunk_hooks, extent_node_addr_get(chunkselm), extent_node_size_get(chunkselm), chunksize, &zero, - false); + &commit, false); assert(chunk == extent_node_addr_get(chunkselm)); assert(zero == extent_node_zeroed_get(chunkselm)); extent_node_dirty_insert(chunkselm, purge_runs_sentinel, @@ -1967,7 +1922,8 @@ arena_reset(tsd_t *tsd, arena_t *arena) assert(!arena->purging); arena->nactive = 0; - for(i = 0; i < runs_avail_nclasses; i++) + for (i = 0; i < sizeof(arena->runs_avail) / sizeof(arena_run_heap_t); + i++) arena_run_heap_new(&arena->runs_avail[i]); malloc_mutex_unlock(tsd_tsdn(tsd), &arena->lock); @@ -3391,7 +3347,7 @@ arena_basic_stats_merge_locked(arena_t *arena, unsigned *nthreads, *nthreads += arena_nthreads_get(arena, false); *dss = dss_prec_names[arena->dss_prec]; *lg_dirty_mult = arena->lg_dirty_mult; - *decay_time = arena->decay_time; + *decay_time = arena->decay.time; *nactive += arena->nactive; *ndirty += arena->ndirty; } @@ -3496,23 +3452,19 @@ arena_t * arena_new(tsdn_t *tsdn, unsigned ind) { arena_t *arena; - size_t arena_size; unsigned i; - /* Compute arena size to incorporate sufficient runs_avail elements. */ - arena_size = offsetof(arena_t, runs_avail) + (sizeof(arena_run_heap_t) * - runs_avail_nclasses); /* * Allocate arena, arena->lstats, and arena->hstats contiguously, mainly * because there is no way to clean up if base_alloc() OOMs. 
*/ if (config_stats) { arena = (arena_t *)base_alloc(tsdn, - CACHELINE_CEILING(arena_size) + QUANTUM_CEILING(nlclasses * - sizeof(malloc_large_stats_t) + nhclasses) * - sizeof(malloc_huge_stats_t)); + CACHELINE_CEILING(sizeof(arena_t)) + + QUANTUM_CEILING((nlclasses * sizeof(malloc_large_stats_t)) + + (nhclasses * sizeof(malloc_huge_stats_t)))); } else - arena = (arena_t *)base_alloc(tsdn, arena_size); + arena = (arena_t *)base_alloc(tsdn, sizeof(arena_t)); if (arena == NULL) return (NULL); @@ -3524,11 +3476,11 @@ arena_new(tsdn_t *tsdn, unsigned ind) if (config_stats) { memset(&arena->stats, 0, sizeof(arena_stats_t)); arena->stats.lstats = (malloc_large_stats_t *)((uintptr_t)arena - + CACHELINE_CEILING(arena_size)); + + CACHELINE_CEILING(sizeof(arena_t))); memset(arena->stats.lstats, 0, nlclasses * sizeof(malloc_large_stats_t)); arena->stats.hstats = (malloc_huge_stats_t *)((uintptr_t)arena - + CACHELINE_CEILING(arena_size) + + + CACHELINE_CEILING(sizeof(arena_t)) + QUANTUM_CEILING(nlclasses * sizeof(malloc_large_stats_t))); memset(arena->stats.hstats, 0, nhclasses * sizeof(malloc_huge_stats_t)); @@ -3551,7 +3503,7 @@ arena_new(tsdn_t *tsdn, unsigned ind) (uint64_t)(uintptr_t)arena; } - arena->dss_prec = chunk_dss_prec_get(tsdn); + arena->dss_prec = chunk_dss_prec_get(); ql_new(&arena->achunks); @@ -3562,8 +3514,10 @@ arena_new(tsdn_t *tsdn, unsigned ind) arena->nactive = 0; arena->ndirty = 0; - for(i = 0; i < runs_avail_nclasses; i++) + for (i = 0; i < sizeof(arena->runs_avail) / sizeof(arena_run_heap_t); + i++) arena_run_heap_new(&arena->runs_avail[i]); + qr_new(&arena->runs_dirty, rd_link); qr_new(&arena->chunks_cache, cc_link); @@ -3693,9 +3647,6 @@ bin_info_run_size_calc(arena_bin_info_t *bin_info) bin_info->reg0_offset = (uint32_t)(actual_run_size - (actual_nregs * bin_info->reg_interval) - pad_size + bin_info->redzone_size); - if (actual_run_size > small_maxrun) - small_maxrun = actual_run_size; - assert(bin_info->reg0_offset - bin_info->redzone_size + (bin_info->nregs * bin_info->reg_interval) + pad_size == bin_info->run_size); } @@ -3711,7 +3662,7 @@ bin_info_init(void) bin_info_run_size_calc(bin_info); \ bitmap_info_init(&bin_info->bitmap_info, bin_info->nregs); #define BIN_INFO_INIT_bin_no(index, size) -#define SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) \ +#define SC(index, lg_grp, lg_delta, ndelta, psz, bin, lg_delta_lookup) \ BIN_INFO_INIT_bin_##bin(index, (ZU(1)<> - LG_PAGE)); - if (small_run_tab == NULL) - return (true); - -#define TAB_INIT_bin_yes(index, size) { \ - arena_bin_info_t *bin_info = &arena_bin_info[index]; \ - small_run_tab[bin_info->run_size >> LG_PAGE] = true; \ - } -#define TAB_INIT_bin_no(index, size) -#define SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) \ - TAB_INIT_bin_##bin(index, (ZU(1)<> LG_PAGE)); - if (run_quantize_floor_tab == NULL) - return (true); - - run_quantize_ceil_tab = (size_t *)base_alloc(NULL, sizeof(size_t) * - (run_quantize_max >> LG_PAGE)); - if (run_quantize_ceil_tab == NULL) - return (true); - - for (i = 1; i <= run_quantize_max >> LG_PAGE; i++) { - size_t run_size = i << LG_PAGE; - - run_quantize_floor_tab[i-1] = - run_quantize_floor_compute(run_size); - run_quantize_ceil_tab[i-1] = - run_quantize_ceil_compute(run_size); - } - - return (false); -} - -bool +void arena_boot(void) { unsigned i; @@ -3822,15 +3718,6 @@ arena_boot(void) nhclasses = NSIZES - nlclasses - NBINS; bin_info_init(); - if (small_run_size_init()) - return (true); - if (run_quantize_init()) - return (true); - - runs_avail_bias = 
size2index(PAGE); - runs_avail_nclasses = size2index(run_quantize_max)+1 - runs_avail_bias; - - return (false); } void diff --git a/src/chunk.c b/src/chunk.c index f292c980..07e26f77 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -316,10 +316,11 @@ chunk_recycle(tsdn_t *tsdn, arena_t *arena, chunk_hooks_t *chunk_hooks, size_t i; size_t *p = (size_t *)(uintptr_t)ret; - JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ret, size); for (i = 0; i < size / sizeof(size_t); i++) assert(p[i] == 0); } + if (config_valgrind) + JEMALLOC_VALGRIND_MAKE_MEM_DEFINED(ret, size); } return (ret); } @@ -384,23 +385,21 @@ chunk_alloc_base(size_t size) void * chunk_alloc_cache(tsdn_t *tsdn, arena_t *arena, chunk_hooks_t *chunk_hooks, - void *new_addr, size_t size, size_t alignment, bool *zero, bool dalloc_node) + void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit, + bool dalloc_node) { void *ret; - bool commit; assert(size != 0); assert((size & chunksize_mask) == 0); assert(alignment != 0); assert((alignment & chunksize_mask) == 0); - commit = true; ret = chunk_recycle(tsdn, arena, chunk_hooks, &arena->chunks_szad_cached, &arena->chunks_ad_cached, true, - new_addr, size, alignment, zero, &commit, dalloc_node); + new_addr, size, alignment, zero, commit, dalloc_node); if (ret == NULL) return (NULL); - assert(commit); if (config_valgrind) JEMALLOC_VALGRIND_MAKE_MEM_UNDEFINED(ret, size); return (ret); @@ -610,10 +609,10 @@ chunk_dalloc_cache(tsdn_t *tsdn, arena_t *arena, chunk_hooks_t *chunk_hooks, } static bool -chunk_dalloc_default_impl(tsdn_t *tsdn, void *chunk, size_t size) +chunk_dalloc_default_impl(void *chunk, size_t size) { - if (!have_dss || !chunk_in_dss(tsdn, chunk)) + if (!have_dss || !chunk_in_dss(chunk)) return (chunk_dalloc_mmap(chunk, size)); return (true); } @@ -622,11 +621,8 @@ static bool chunk_dalloc_default(void *chunk, size_t size, bool committed, unsigned arena_ind) { - tsdn_t *tsdn; - tsdn = tsdn_fetch(); - - return (chunk_dalloc_default_impl(tsdn, chunk, size)); + return (chunk_dalloc_default_impl(chunk, size)); } void @@ -644,7 +640,7 @@ chunk_dalloc_wrapper(tsdn_t *tsdn, arena_t *arena, chunk_hooks_t *chunk_hooks, /* Try to deallocate. */ if (chunk_hooks->dalloc == chunk_dalloc_default) { /* Call directly to propagate tsdn. 
*/ - err = chunk_dalloc_default_impl(tsdn, chunk, size); + err = chunk_dalloc_default_impl(chunk, size); } else err = chunk_hooks->dalloc(chunk, size, committed, arena->ind); @@ -717,13 +713,12 @@ chunk_split_default(void *chunk, size_t size, size_t size_a, size_t size_b, } static bool -chunk_merge_default_impl(tsdn_t *tsdn, void *chunk_a, void *chunk_b) +chunk_merge_default_impl(void *chunk_a, void *chunk_b) { if (!maps_coalesce) return (true); - if (have_dss && chunk_in_dss(tsdn, chunk_a) != chunk_in_dss(tsdn, - chunk_b)) + if (have_dss && !chunk_dss_mergeable(chunk_a, chunk_b)) return (true); return (false); @@ -733,11 +728,8 @@ static bool chunk_merge_default(void *chunk_a, size_t size_a, void *chunk_b, size_t size_b, bool committed, unsigned arena_ind) { - tsdn_t *tsdn; - tsdn = tsdn_fetch(); - - return (chunk_merge_default_impl(tsdn, chunk_a, chunk_b)); + return (chunk_merge_default_impl(chunk_a, chunk_b)); } static rtree_node_elm_t * @@ -781,32 +773,11 @@ chunk_boot(void) chunksize_mask = chunksize - 1; chunk_npages = (chunksize >> LG_PAGE); - if (have_dss && chunk_dss_boot()) - return (true); + if (have_dss) + chunk_dss_boot(); if (rtree_new(&chunks_rtree, (unsigned)((ZU(1) << (LG_SIZEOF_PTR+3)) - opt_lg_chunk), chunks_rtree_node_alloc, NULL)) return (true); return (false); } - -void -chunk_prefork(tsdn_t *tsdn) -{ - - chunk_dss_prefork(tsdn); -} - -void -chunk_postfork_parent(tsdn_t *tsdn) -{ - - chunk_dss_postfork_parent(tsdn); -} - -void -chunk_postfork_child(tsdn_t *tsdn) -{ - - chunk_dss_postfork_child(tsdn); -} diff --git a/src/chunk_dss.c b/src/chunk_dss.c index 0b1f82bd..85a13548 100644 --- a/src/chunk_dss.c +++ b/src/chunk_dss.c @@ -10,20 +10,19 @@ const char *dss_prec_names[] = { "N/A" }; -/* Current dss precedence default, used when creating new arenas. */ -static dss_prec_t dss_prec_default = DSS_PREC_DEFAULT; - /* - * Protects sbrk() calls. This avoids malloc races among threads, though it - * does not protect against races with threads that call sbrk() directly. + * Current dss precedence default, used when creating new arenas. NB: This is + * stored as unsigned rather than dss_prec_t because in principle there's no + * guarantee that sizeof(dss_prec_t) is the same as sizeof(unsigned), and we use + * atomic operations to synchronize the setting. */ -static malloc_mutex_t dss_mtx; +static unsigned dss_prec_default = (unsigned)DSS_PREC_DEFAULT; /* Base address of the DSS. */ static void *dss_base; -/* Current end of the DSS, or ((void *)-1) if the DSS is exhausted. */ -static void *dss_prev; -/* Current upper limit on DSS addresses. */ +/* Atomic boolean indicating whether the DSS is exhausted. */ +static unsigned dss_exhausted; +/* Atomic current upper limit on DSS addresses. 
*/ static void *dss_max; /******************************************************************************/ @@ -41,30 +40,59 @@ chunk_dss_sbrk(intptr_t increment) } dss_prec_t -chunk_dss_prec_get(tsdn_t *tsdn) +chunk_dss_prec_get(void) { dss_prec_t ret; if (!have_dss) return (dss_prec_disabled); - malloc_mutex_lock(tsdn, &dss_mtx); - ret = dss_prec_default; - malloc_mutex_unlock(tsdn, &dss_mtx); + ret = (dss_prec_t)atomic_read_u(&dss_prec_default); return (ret); } bool -chunk_dss_prec_set(tsdn_t *tsdn, dss_prec_t dss_prec) +chunk_dss_prec_set(dss_prec_t dss_prec) { if (!have_dss) return (dss_prec != dss_prec_disabled); - malloc_mutex_lock(tsdn, &dss_mtx); - dss_prec_default = dss_prec; - malloc_mutex_unlock(tsdn, &dss_mtx); + atomic_write_u(&dss_prec_default, (unsigned)dss_prec); return (false); } +static void * +chunk_dss_max_update(void *new_addr) +{ + void *max_cur; + spin_t spinner; + + /* + * Get the current end of the DSS as max_cur and ensure that dss_max is + * up to date. + */ + spin_init(&spinner); + while (true) { + void *max_prev = atomic_read_p(&dss_max); + + max_cur = chunk_dss_sbrk(0); + if ((uintptr_t)max_prev > (uintptr_t)max_cur) { + /* + * Another thread optimistically updated dss_max. Wait + * for it to finish. + */ + spin_adaptive(&spinner); + continue; + } + if (!atomic_cas_p(&dss_max, max_prev, max_cur)) + break; + } + /* A fixed new_addr can only be supported if it is at the edge of the DSS. */ + if (new_addr != NULL && max_cur != new_addr) + return (NULL); + + return (max_cur); +} + void * chunk_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit) @@ -80,28 +108,20 @@ chunk_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, if ((intptr_t)size < 0) return (NULL); - malloc_mutex_lock(tsdn, &dss_mtx); - if (dss_prev != (void *)-1) { - + if (!atomic_read_u(&dss_exhausted)) { /* * The loop is necessary to recover from races with other * threads that are using the DSS for something other than * malloc. */ - do { - void *ret, *cpad, *dss_next; + while (true) { + void *ret, *cpad, *max_cur, *dss_next, *dss_prev; size_t gap_size, cpad_size; intptr_t incr; - /* Avoid an unnecessary system call. */ - if (new_addr != NULL && dss_max != new_addr) - break; - /* Get the current end of the DSS. */ - dss_max = chunk_dss_sbrk(0); - - /* Make sure the earlier condition still holds. */ - if (new_addr != NULL && dss_max != new_addr) - break; + max_cur = chunk_dss_max_update(new_addr); + if (max_cur == NULL) + goto label_oom; /* * Calculate how much padding is necessary to @@ -120,17 +140,23 @@ chunk_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, cpad_size = (uintptr_t)ret - (uintptr_t)cpad; dss_next = (void *)((uintptr_t)ret + size); if ((uintptr_t)ret < (uintptr_t)dss_max || - (uintptr_t)dss_next < (uintptr_t)dss_max) { - /* Wrap-around. */ - malloc_mutex_unlock(tsdn, &dss_mtx); - return (NULL); - } + (uintptr_t)dss_next < (uintptr_t)dss_max) + goto label_oom; /* Wrap-around. */ incr = gap_size + cpad_size + size; + + /* + * Optimistically update dss_max, and roll back below if + * sbrk() fails. No other thread will try to extend the + * DSS while dss_max is greater than the current DSS + * max reported by sbrk(0). + */ + if (atomic_cas_p(&dss_max, max_cur, dss_next)) + continue; + + /* Try to allocate. */ dss_prev = chunk_dss_sbrk(incr); - if (dss_prev == dss_max) { + if (dss_prev == max_cur) { /* Success. 
*/ - dss_max = dss_next; - malloc_mutex_unlock(tsdn, &dss_mtx); if (cpad_size != 0) { chunk_hooks_t chunk_hooks = CHUNK_HOOKS_INITIALIZER; @@ -147,68 +173,65 @@ chunk_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr, size_t size, *commit = pages_decommit(ret, size); return (ret); } - } while (dss_prev != (void *)-1); - } - malloc_mutex_unlock(tsdn, &dss_mtx); + /* + * Failure, whether due to OOM or a race with a raw + * sbrk() call from outside the allocator. Try to roll + * back optimistic dss_max update; if rollback fails, + * it's due to another caller of this function having + * succeeded since this invocation started, in which + * case rollback is not necessary. + */ + atomic_cas_p(&dss_max, dss_next, max_cur); + if (dss_prev == (void *)-1) { + /* OOM. */ + atomic_write_u(&dss_exhausted, (unsigned)true); + goto label_oom; + } + } + } +label_oom: return (NULL); } -bool -chunk_in_dss(tsdn_t *tsdn, void *chunk) +static bool +chunk_in_dss_helper(void *chunk, void *max) { - bool ret; - cassert(have_dss); - - malloc_mutex_lock(tsdn, &dss_mtx); - if ((uintptr_t)chunk >= (uintptr_t)dss_base - && (uintptr_t)chunk < (uintptr_t)dss_max) - ret = true; - else - ret = false; - malloc_mutex_unlock(tsdn, &dss_mtx); - - return (ret); + return ((uintptr_t)chunk >= (uintptr_t)dss_base && (uintptr_t)chunk < + (uintptr_t)max); } bool +chunk_in_dss(void *chunk) +{ + + cassert(have_dss); + + return (chunk_in_dss_helper(chunk, atomic_read_p(&dss_max))); +} + +bool +chunk_dss_mergeable(void *chunk_a, void *chunk_b) +{ + void *max; + + cassert(have_dss); + + max = atomic_read_p(&dss_max); + return (chunk_in_dss_helper(chunk_a, max) == + chunk_in_dss_helper(chunk_b, max)); +} + +void chunk_dss_boot(void) { cassert(have_dss); - if (malloc_mutex_init(&dss_mtx, "dss", WITNESS_RANK_DSS)) - return (true); dss_base = chunk_dss_sbrk(0); - dss_prev = dss_base; + dss_exhausted = (unsigned)(dss_base == (void *)-1); dss_max = dss_base; - - return (false); -} - -void -chunk_dss_prefork(tsdn_t *tsdn) -{ - - if (have_dss) - malloc_mutex_prefork(tsdn, &dss_mtx); -} - -void -chunk_dss_postfork_parent(tsdn_t *tsdn) -{ - - if (have_dss) - malloc_mutex_postfork_parent(tsdn, &dss_mtx); -} - -void -chunk_dss_postfork_child(tsdn_t *tsdn) -{ - - if (have_dss) - malloc_mutex_postfork_child(tsdn, &dss_mtx); } /******************************************************************************/ diff --git a/src/ckh.c b/src/ckh.c index 747c1c86..3be671c3 100644 --- a/src/ckh.c +++ b/src/ckh.c @@ -40,8 +40,8 @@ /******************************************************************************/ /* Function prototypes for non-inline static functions. 
*/ -static bool ckh_grow(tsdn_t *tsdn, ckh_t *ckh); -static void ckh_shrink(tsdn_t *tsdn, ckh_t *ckh); +static bool ckh_grow(tsd_t *tsd, ckh_t *ckh); +static void ckh_shrink(tsd_t *tsd, ckh_t *ckh); /******************************************************************************/ @@ -244,7 +244,7 @@ ckh_rebuild(ckh_t *ckh, ckhc_t *aTab) } static bool -ckh_grow(tsdn_t *tsdn, ckh_t *ckh) +ckh_grow(tsd_t *tsd, ckh_t *ckh) { bool ret; ckhc_t *tab, *ttab; @@ -270,8 +270,8 @@ ckh_grow(tsdn_t *tsdn, ckh_t *ckh) ret = true; goto label_return; } - tab = (ckhc_t *)ipallocztm(tsdn, usize, CACHELINE, true, NULL, - true, arena_ichoose(tsdn, NULL)); + tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, + true, NULL, true, arena_ichoose(tsd, NULL)); if (tab == NULL) { ret = true; goto label_return; @@ -283,12 +283,12 @@ ckh_grow(tsdn_t *tsdn, ckh_t *ckh) ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; if (!ckh_rebuild(ckh, tab)) { - idalloctm(tsdn, tab, NULL, true, true); + idalloctm(tsd_tsdn(tsd), tab, NULL, true, true); break; } /* Rebuilding failed, so back out partially rebuilt table. */ - idalloctm(tsdn, ckh->tab, NULL, true, true); + idalloctm(tsd_tsdn(tsd), ckh->tab, NULL, true, true); ckh->tab = tab; ckh->lg_curbuckets = lg_prevbuckets; } @@ -299,7 +299,7 @@ label_return: } static void -ckh_shrink(tsdn_t *tsdn, ckh_t *ckh) +ckh_shrink(tsd_t *tsd, ckh_t *ckh) { ckhc_t *tab, *ttab; size_t usize; @@ -314,8 +314,8 @@ ckh_shrink(tsdn_t *tsdn, ckh_t *ckh) usize = sa2u(sizeof(ckhc_t) << lg_curcells, CACHELINE); if (unlikely(usize == 0 || usize > HUGE_MAXCLASS)) return; - tab = (ckhc_t *)ipallocztm(tsdn, usize, CACHELINE, true, NULL, true, - arena_ichoose(tsdn, NULL)); + tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, true, NULL, + true, arena_ichoose(tsd, NULL)); if (tab == NULL) { /* * An OOM error isn't worth propagating, since it doesn't @@ -330,7 +330,7 @@ ckh_shrink(tsdn_t *tsdn, ckh_t *ckh) ckh->lg_curbuckets = lg_curcells - LG_CKH_BUCKET_CELLS; if (!ckh_rebuild(ckh, tab)) { - idalloctm(tsdn, tab, NULL, true, true); + idalloctm(tsd_tsdn(tsd), tab, NULL, true, true); #ifdef CKH_COUNT ckh->nshrinks++; #endif @@ -338,7 +338,7 @@ ckh_shrink(tsdn_t *tsdn, ckh_t *ckh) } /* Rebuilding failed, so back out partially rebuilt table. 
*/ - idalloctm(tsdn, ckh->tab, NULL, true, true); + idalloctm(tsd_tsdn(tsd), ckh->tab, NULL, true, true); ckh->tab = tab; ckh->lg_curbuckets = lg_prevbuckets; #ifdef CKH_COUNT @@ -347,7 +347,7 @@ ckh_shrink(tsdn_t *tsdn, ckh_t *ckh) } bool -ckh_new(tsdn_t *tsdn, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, +ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ckh_keycomp_t *keycomp) { bool ret; @@ -391,8 +391,8 @@ ckh_new(tsdn_t *tsdn, ckh_t *ckh, size_t minitems, ckh_hash_t *hash, ret = true; goto label_return; } - ckh->tab = (ckhc_t *)ipallocztm(tsdn, usize, CACHELINE, true, NULL, - true, arena_ichoose(tsdn, NULL)); + ckh->tab = (ckhc_t *)ipallocztm(tsd_tsdn(tsd), usize, CACHELINE, true, + NULL, true, arena_ichoose(tsd, NULL)); if (ckh->tab == NULL) { ret = true; goto label_return; @@ -404,7 +404,7 @@ label_return: } void -ckh_delete(tsdn_t *tsdn, ckh_t *ckh) +ckh_delete(tsd_t *tsd, ckh_t *ckh) { assert(ckh != NULL); @@ -421,7 +421,7 @@ ckh_delete(tsdn_t *tsdn, ckh_t *ckh) (unsigned long long)ckh->nrelocs); #endif - idalloctm(tsdn, ckh->tab, NULL, true, true); + idalloctm(tsd_tsdn(tsd), ckh->tab, NULL, true, true); if (config_debug) memset(ckh, JEMALLOC_FREE_JUNK, sizeof(ckh_t)); } @@ -456,7 +456,7 @@ ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data) } bool -ckh_insert(tsdn_t *tsdn, ckh_t *ckh, const void *key, const void *data) +ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data) { bool ret; @@ -468,7 +468,7 @@ ckh_insert(tsdn_t *tsdn, ckh_t *ckh, const void *key, const void *data) #endif while (ckh_try_insert(ckh, &key, &data)) { - if (ckh_grow(tsdn, ckh)) { + if (ckh_grow(tsd, ckh)) { ret = true; goto label_return; } @@ -480,7 +480,7 @@ label_return: } bool -ckh_remove(tsdn_t *tsdn, ckh_t *ckh, const void *searchkey, void **key, +ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key, void **data) { size_t cell; @@ -502,7 +502,7 @@ ckh_remove(tsdn_t *tsdn, ckh_t *ckh, const void *searchkey, void **key, + LG_CKH_BUCKET_CELLS - 2)) && ckh->lg_curbuckets > ckh->lg_minbuckets) { /* Ignore error due to OOM. 
*/ - ckh_shrink(tsdn, ckh); + ckh_shrink(tsd, ckh); } return (false); diff --git a/src/ctl.c b/src/ctl.c index dad80086..bc78b205 100644 --- a/src/ctl.c +++ b/src/ctl.c @@ -1478,7 +1478,7 @@ tcache_create_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, malloc_mutex_lock(tsd_tsdn(tsd), &ctl_mtx); READONLY(); - if (tcaches_create(tsd_tsdn(tsd), &tcache_ind)) { + if (tcaches_create(tsd, &tcache_ind)) { ret = EFAULT; goto label_return; } @@ -1685,11 +1685,11 @@ arena_i_dss_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, dss_prec_old = arena_dss_prec_get(tsd_tsdn(tsd), arena); } else { if (dss_prec != dss_prec_limit && - chunk_dss_prec_set(tsd_tsdn(tsd), dss_prec)) { + chunk_dss_prec_set(dss_prec)) { ret = EFAULT; goto label_return; } - dss_prec_old = chunk_dss_prec_get(tsd_tsdn(tsd)); + dss_prec_old = chunk_dss_prec_get(); } dss = dss_prec_names[dss_prec_old]; @@ -2100,7 +2100,7 @@ prof_reset_ctl(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp, if (lg_sample >= (sizeof(uint64_t) << 3)) lg_sample = (sizeof(uint64_t) << 3) - 1; - prof_reset(tsd_tsdn(tsd), lg_sample); + prof_reset(tsd, lg_sample); ret = 0; label_return: diff --git a/src/huge.c b/src/huge.c index 3a2877ca..62e6932b 100644 --- a/src/huge.c +++ b/src/huge.c @@ -54,6 +54,7 @@ huge_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, { void *ret; size_t ausize; + arena_t *iarena; extent_node_t *node; bool is_zeroed; @@ -67,8 +68,9 @@ huge_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, assert(ausize >= chunksize); /* Allocate an extent node with which to track the chunk. */ + iarena = (!tsdn_null(tsdn)) ? arena_ichoose(tsdn_tsd(tsdn), NULL) : a0get(); node = ipallocztm(tsdn, CACHELINE_CEILING(sizeof(extent_node_t)), - CACHELINE, false, NULL, true, arena_ichoose(tsdn, arena)); + CACHELINE, false, NULL, true, iarena); if (node == NULL) return (NULL); @@ -114,7 +116,7 @@ huge_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, #define huge_dalloc_junk JEMALLOC_N(huge_dalloc_junk_impl) #endif static void -huge_dalloc_junk(tsdn_t *tsdn, void *ptr, size_t usize) +huge_dalloc_junk(void *ptr, size_t usize) { if (config_fill && have_dss && unlikely(opt_junk_free)) { @@ -122,7 +124,7 @@ huge_dalloc_junk(tsdn_t *tsdn, void *ptr, size_t usize) * Only bother junk filling if the chunk isn't about to be * unmapped. */ - if (!config_munmap || (have_dss && chunk_in_dss(tsdn, ptr))) + if (!config_munmap || (have_dss && chunk_in_dss(ptr))) memset(ptr, JEMALLOC_FREE_JUNK, usize); } } @@ -221,7 +223,7 @@ huge_ralloc_no_move_shrink(tsdn_t *tsdn, void *ptr, size_t oldsize, if (oldsize > usize) { size_t sdiff = oldsize - usize; if (config_fill && unlikely(opt_junk_free)) { - huge_dalloc_junk(tsdn, (void *)((uintptr_t)ptr + usize), + huge_dalloc_junk((void *)((uintptr_t)ptr + usize), sdiff); post_zeroed = false; } else { @@ -402,7 +404,7 @@ huge_dalloc(tsdn_t *tsdn, void *ptr) ql_remove(&arena->huge, node, ql_link); malloc_mutex_unlock(tsdn, &arena->huge_mtx); - huge_dalloc_junk(tsdn, extent_node_addr_get(node), + huge_dalloc_junk(extent_node_addr_get(node), extent_node_size_get(node)); arena_chunk_dalloc_huge(tsdn, extent_node_arena_get(node), extent_node_addr_get(node), extent_node_size_get(node)); diff --git a/src/jemalloc.c b/src/jemalloc.c index 5d1f4937..38650ff0 100644 --- a/src/jemalloc.c +++ b/src/jemalloc.c @@ -5,7 +5,11 @@ /* Data. */ /* Runtime configuration options. 
*/ -const char *je_malloc_conf JEMALLOC_ATTR(weak); +const char *je_malloc_conf +#ifndef _WIN32 + JEMALLOC_ATTR(weak) +#endif + ; bool opt_abort = #ifdef JEMALLOC_DEBUG true @@ -85,14 +89,25 @@ enum { }; static uint8_t malloc_slow_flags; -/* Last entry for overflow detection only. */ JEMALLOC_ALIGNED(CACHELINE) -const size_t index2size_tab[NSIZES+1] = { -#define SC(index, lg_grp, lg_delta, ndelta, bin, lg_delta_lookup) \ +const size_t pind2sz_tab[NPSIZES] = { +#define PSZ_yes(lg_grp, ndelta, lg_delta) \ + (((ZU(1)<<lg_grp) + (ZU(ndelta)<<lg_delta))), [...] +#elif defined(JEMALLOC_GLIBC_MALLOC_HOOK) && defined(CPU_COUNT) + /* + * glibc >= 2.6 has the CPU_COUNT macro. + * + * glibc's sysconf() uses isspace(). glibc allocates for the first time + * *before* setting up the isspace tables. Therefore we need a + * different method to get the number of CPUs. + */ + { + cpu_set_t set; + + pthread_getaffinity_np(pthread_self(), sizeof(set), &set); + result = CPU_COUNT(&set); + } #else result = sysconf(_SC_NPROCESSORS_ONLN); #endif @@ -1103,8 +1140,7 @@ malloc_conf_init(void) for (i = 0; i < dss_prec_limit; i++) { if (strncmp(dss_prec_names[i], v, vlen) == 0) { - if (chunk_dss_prec_set(NULL, - i)) { + if (chunk_dss_prec_set(i)) { malloc_conf_error( "Error setting dss", k, klen, v, vlen); @@ -1149,9 +1185,20 @@ malloc_conf_init(void) if (config_fill) { if (CONF_MATCH("junk")) { if (CONF_MATCH_VALUE("true")) { - opt_junk = "true"; - opt_junk_alloc = opt_junk_free = - true; + if (config_valgrind && + unlikely(in_valgrind)) { + malloc_conf_error( + "Deallocation-time " + "junk filling cannot " + "be enabled while " + "running inside " + "Valgrind", k, klen, v, + vlen); + } else { + opt_junk = "true"; + opt_junk_alloc = true; + opt_junk_free = true; + } } else if (CONF_MATCH_VALUE("false")) { opt_junk = "false"; opt_junk_alloc = opt_junk_free = @@ -1161,9 +1208,20 @@ malloc_conf_init(void) opt_junk_alloc = true; opt_junk_free = false; } else if (CONF_MATCH_VALUE("free")) { - opt_junk = "free"; - opt_junk_alloc = false; - opt_junk_free = true; + if (config_valgrind && + unlikely(in_valgrind)) { + malloc_conf_error( + "Deallocation-time " + "junk filling cannot " + "be enabled while " + "running inside " + "Valgrind", k, klen, v, + vlen); + } else { + opt_junk = "free"; + opt_junk_alloc = false; + opt_junk_free = true; + } } else { malloc_conf_error( "Invalid conf value", k, @@ -1249,11 +1307,14 @@ malloc_init_hard_needed(void) } #ifdef JEMALLOC_THREADED_INIT if (malloc_initializer != NO_INITIALIZER && !IS_INITIALIZER) { + spin_t spinner; + /* Busy-wait until the initializing thread completes. 
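The malloc_ncpus() hunk above exists because glibc's sysconf() can itself allocate (it parses with isspace(), whose tables are set up lazily), which is fatal while the allocator is still bootstrapping. A standalone version of the affinity-mask probe, assuming glibc >= 2.6; note that it counts the CPUs in the calling thread's affinity mask, which can be fewer than the CPUs online:

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

int
main(void)
{
	cpu_set_t set;
	long result = 1;	/* conservative fallback */

	if (pthread_getaffinity_np(pthread_self(), sizeof(set), &set) == 0)
		result = CPU_COUNT(&set);	/* no allocation involved */
	printf("ncpus: %ld\n", result);
	return (0);
}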
*/ + spin_init(&spinner); do { - malloc_mutex_unlock(NULL, &init_lock); - CPU_SPINWAIT; - malloc_mutex_lock(NULL, &init_lock); + malloc_mutex_unlock(TSDN_NULL, &init_lock); + spin_adaptive(&spinner); + malloc_mutex_lock(TSDN_NULL, &init_lock); } while (!malloc_initialized()); return (false); } @@ -1287,8 +1348,7 @@ malloc_init_hard_a0_locked() return (true); if (config_prof) prof_boot1(); - if (arena_boot()) - return (true); + arena_boot(); if (config_tcache && tcache_boot(TSDN_NULL)) return (true); if (malloc_mutex_init(&arenas_lock, "arenas", WITNESS_RANK_ARENAS)) @@ -1419,7 +1479,7 @@ malloc_init_hard(void) return (true); malloc_mutex_lock(tsd_tsdn(tsd), &init_lock); - if (config_prof && prof_boot2(tsd_tsdn(tsd))) { + if (config_prof && prof_boot2(tsd)) { malloc_mutex_unlock(tsd_tsdn(tsd), &init_lock); return (true); } @@ -1994,6 +2054,29 @@ JEMALLOC_EXPORT void *(*__realloc_hook)(void *ptr, size_t size) = je_realloc; JEMALLOC_EXPORT void *(*__memalign_hook)(size_t alignment, size_t size) = je_memalign; # endif + +#ifdef CPU_COUNT +/* + * To enable static linking with glibc, the libc specific malloc interface must + * be implemented also, so none of glibc's malloc.o functions are added to the + * link. + */ +#define ALIAS(je_fn) __attribute__((alias (#je_fn), used)) +/* To force macro expansion of je_ prefix before stringification. */ +#define PREALIAS(je_fn) ALIAS(je_fn) +void *__libc_malloc(size_t size) PREALIAS(je_malloc); +void __libc_free(void* ptr) PREALIAS(je_free); +void *__libc_realloc(void* ptr, size_t size) PREALIAS(je_realloc); +void *__libc_calloc(size_t n, size_t size) PREALIAS(je_calloc); +void *__libc_memalign(size_t align, size_t s) PREALIAS(je_memalign); +void *__libc_valloc(size_t size) PREALIAS(je_valloc); +int __posix_memalign(void** r, size_t a, size_t s) + PREALIAS(je_posix_memalign); +#undef PREALIAS +#undef ALIAS + +#endif + #endif /* @@ -2747,7 +2830,6 @@ _malloc_prefork(void) } } base_prefork(tsd_tsdn(tsd)); - chunk_prefork(tsd_tsdn(tsd)); for (i = 0; i < narenas; i++) { if ((arena = arena_get(tsd_tsdn(tsd), i, false)) != NULL) arena_prefork3(tsd_tsdn(tsd), arena); @@ -2776,7 +2858,6 @@ _malloc_postfork(void) witness_postfork_parent(tsd); /* Release all mutexes, now that fork() has completed. */ - chunk_postfork_parent(tsd_tsdn(tsd)); base_postfork_parent(tsd_tsdn(tsd)); for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { arena_t *arena; @@ -2801,7 +2882,6 @@ jemalloc_postfork_child(void) witness_postfork_child(tsd); /* Release all mutexes, now that fork() has completed. 
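The ALIAS/PREALIAS block above works because __attribute__((alias)) makes the __libc_* names resolve to jemalloc's own definitions at link time, so the linker never needs to pull glibc's malloc.o into a static link. The mechanism in isolation, with a hypothetical my_malloc standing in for the real entry point (the placeholder body is not a working allocator):

#include <stddef.h>

void *
my_malloc(size_t size)
{
	(void)size;
	return (NULL);	/* placeholder; the real alias target is je_malloc */
}

/*
 * Same object code, second linker-visible name; "used" keeps the alias from
 * being discarded at higher optimization levels.
 */
void *__libc_malloc(size_t size) __attribute__((alias ("my_malloc"), used));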
*/ - chunk_postfork_child(tsd_tsdn(tsd)); base_postfork_child(tsd_tsdn(tsd)); for (i = 0, narenas = narenas_total_get(); i < narenas; i++) { arena_t *arena; diff --git a/src/mutex.c b/src/mutex.c index a1fac342..6333e73d 100644 --- a/src/mutex.c +++ b/src/mutex.c @@ -80,6 +80,8 @@ malloc_mutex_init(malloc_mutex_t *mutex, const char *name, witness_rank_t rank) _CRT_SPINCOUNT)) return (true); # endif +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) + mutex->lock = OS_UNFAIR_LOCK_INIT; #elif (defined(JEMALLOC_OSSPIN)) mutex->lock = 0; #elif (defined(JEMALLOC_MUTEX_INIT_CB)) diff --git a/src/nstime.c b/src/nstime.c index aad2c260..0948e29f 100644 --- a/src/nstime.c +++ b/src/nstime.c @@ -97,6 +97,76 @@ nstime_divide(const nstime_t *time, const nstime_t *divisor) return (time->ns / divisor->ns); } +#ifdef _WIN32 +# define NSTIME_MONOTONIC true +static void +nstime_get(nstime_t *time) +{ + FILETIME ft; + uint64_t ticks_100ns; + + GetSystemTimeAsFileTime(&ft); + ticks_100ns = (((uint64_t)ft.dwHighDateTime) << 32) | ft.dwLowDateTime; + + nstime_init(time, ticks_100ns * 100); +} +#elif JEMALLOC_HAVE_CLOCK_MONOTONIC_COARSE +# define NSTIME_MONOTONIC true +static void +nstime_get(nstime_t *time) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC_COARSE, &ts); + nstime_init2(time, ts.tv_sec, ts.tv_nsec); +} +#elif JEMALLOC_HAVE_CLOCK_MONOTONIC +# define NSTIME_MONOTONIC true +static void +nstime_get(nstime_t *time) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + nstime_init2(time, ts.tv_sec, ts.tv_nsec); +} +#elif JEMALLOC_HAVE_MACH_ABSOLUTE_TIME +# define NSTIME_MONOTONIC true +static void +nstime_get(nstime_t *time) +{ + + nstime_init(time, mach_absolute_time()); +} +#else +# define NSTIME_MONOTONIC false +static void +nstime_get(nstime_t *time) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + nstime_init2(time, tv.tv_sec, tv.tv_usec * 1000); +} +#endif + +#ifdef JEMALLOC_JET +#undef nstime_monotonic +#define nstime_monotonic JEMALLOC_N(n_nstime_monotonic) +#endif +bool +nstime_monotonic(void) +{ + + return (NSTIME_MONOTONIC); +#undef NSTIME_MONOTONIC +} +#ifdef JEMALLOC_JET +#undef nstime_monotonic +#define nstime_monotonic JEMALLOC_N(nstime_monotonic) +nstime_monotonic_t *nstime_monotonic = JEMALLOC_N(n_nstime_monotonic); +#endif + #ifdef JEMALLOC_JET #undef nstime_update #define nstime_update JEMALLOC_N(n_nstime_update) @@ -107,33 +177,7 @@ nstime_update(nstime_t *time) nstime_t old_time; nstime_copy(&old_time, time); - -#ifdef _WIN32 - { - FILETIME ft; - uint64_t ticks; - GetSystemTimeAsFileTime(&ft); - ticks = (((uint64_t)ft.dwHighDateTime) << 32) | - ft.dwLowDateTime; - time->ns = ticks * 100; - } -#elif JEMALLOC_CLOCK_GETTIME - { - struct timespec ts; - - if (sysconf(_SC_MONOTONIC_CLOCK) > 0) - clock_gettime(CLOCK_MONOTONIC, &ts); - else - clock_gettime(CLOCK_REALTIME, &ts); - time->ns = ts.tv_sec * BILLION + ts.tv_nsec; - } -#else - { - struct timeval tv; - gettimeofday(&tv, NULL); - time->ns = tv.tv_sec * BILLION + tv.tv_usec * 1000; - } -#endif + nstime_get(time); /* Handle non-monotonic clocks. */ if (unlikely(nstime_compare(&old_time, time) > 0)) { diff --git a/src/pages.c b/src/pages.c index 2a9b7e37..647952ac 100644 --- a/src/pages.c +++ b/src/pages.c @@ -207,6 +207,11 @@ os_overcommits_sysctl(void) #endif #ifdef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY +/* + * Use syscall(2) rather than {open,read,close}(2) when possible to avoid + * reentry during bootstrapping if another library has interposed system call + * wrappers. 
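The new nstime_get() above selects a clock at compile time in decreasing order of preference: coarse monotonic (cheapest), monotonic, mach_absolute_time(), and only then the non-monotonic gettimeofday(2), with nstime_update() still guarding against backward jumps. A Linux-only sketch of the same preference order:

#include <stdint.h>
#include <time.h>

static uint64_t
monotonic_ns(void)
{
	struct timespec ts;

#ifdef CLOCK_MONOTONIC_COARSE
	/* Tick-granularity, but much cheaper to read. */
	if (clock_gettime(CLOCK_MONOTONIC_COARSE, &ts) == 0)
		return ((uint64_t)ts.tv_sec * 1000000000 + ts.tv_nsec);
#endif
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000 + ts.tv_nsec);
}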
+ */ static bool os_overcommits_proc(void) { @@ -214,11 +219,26 @@ os_overcommits_proc(void) char buf[1]; ssize_t nread; +#if defined(JEMALLOC_HAVE_SYSCALL) && defined(SYS_open) + fd = (int)syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY); +#else fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY); +#endif if (fd == -1) return (false); /* Error. */ +#if defined(JEMALLOC_HAVE_SYSCALL) && defined(SYS_read) + nread = (ssize_t)syscall(SYS_read, fd, &buf, sizeof(buf)); +#else nread = read(fd, &buf, sizeof(buf)); +#endif + +#if defined(JEMALLOC_HAVE_SYSCALL) && defined(SYS_close) + syscall(SYS_close, fd); +#else + close(fd); +#endif + if (nread < 1) return (false); /* Error. */ /* diff --git a/src/prof.c b/src/prof.c index c1f58d46..140d5b22 100644 --- a/src/prof.c +++ b/src/prof.c @@ -125,7 +125,7 @@ static bool prof_tctx_should_destroy(tsdn_t *tsdn, prof_tctx_t *tctx); static void prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx); static bool prof_tdata_should_destroy(tsdn_t *tsdn, prof_tdata_t *tdata, bool even_if_attached); -static void prof_tdata_destroy(tsdn_t *tsdn, prof_tdata_t *tdata, +static void prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata, bool even_if_attached); static char *prof_thread_name_alloc(tsdn_t *tsdn, const char *thread_name); @@ -591,7 +591,7 @@ prof_gctx_try_destroy(tsd_t *tsd, prof_tdata_t *tdata_self, prof_gctx_t *gctx, assert(gctx->nlimbo != 0); if (tctx_tree_empty(&gctx->tctxs) && gctx->nlimbo == 1) { /* Remove gctx from bt2gctx. */ - if (ckh_remove(tsd_tsdn(tsd), &bt2gctx, &gctx->bt, NULL, NULL)) + if (ckh_remove(tsd, &bt2gctx, &gctx->bt, NULL, NULL)) not_reached(); prof_leave(tsd, tdata_self); /* Destroy gctx. */ @@ -651,7 +651,7 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) assert(tctx->cnts.accumobjs == 0); assert(tctx->cnts.accumbytes == 0); - ckh_remove(tsd_tsdn(tsd), &tdata->bt2tctx, &gctx->bt, NULL, NULL); + ckh_remove(tsd, &tdata->bt2tctx, &gctx->bt, NULL, NULL); destroy_tdata = prof_tdata_should_destroy(tsd_tsdn(tsd), tdata, false); malloc_mutex_unlock(tsd_tsdn(tsd), tdata->lock); @@ -704,7 +704,7 @@ prof_tctx_destroy(tsd_t *tsd, prof_tctx_t *tctx) malloc_mutex_assert_not_owner(tsd_tsdn(tsd), tctx->tdata->lock); if (destroy_tdata) - prof_tdata_destroy(tsd_tsdn(tsd), tdata, false); + prof_tdata_destroy(tsd, tdata, false); if (destroy_tctx) idalloctm(tsd_tsdn(tsd), tctx, NULL, true, true); @@ -733,7 +733,7 @@ prof_lookup_global(tsd_t *tsd, prof_bt_t *bt, prof_tdata_t *tdata, return (true); } btkey.p = &gctx.p->bt; - if (ckh_insert(tsd_tsdn(tsd), &bt2gctx, btkey.v, gctx.v)) { + if (ckh_insert(tsd, &bt2gctx, btkey.v, gctx.v)) { /* OOM. */ prof_leave(tsd, tdata); idalloctm(tsd_tsdn(tsd), gctx.v, NULL, true, true); @@ -795,7 +795,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) /* Link a prof_tctx_t into gctx for this thread. 
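Same point as the comment above, in a self-contained form: syscall(2) enters the kernel directly, so an interposing library's open()/read()/close() wrappers (which may allocate) never run. The helper name is hypothetical; the fallback mirrors the #else branches in the hunk:

#include <fcntl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int
overcommit_mode(void)
{
	long fd, nread;
	char buf[1];

#if defined(SYS_open)
	fd = syscall(SYS_open, "/proc/sys/vm/overcommit_memory", O_RDONLY);
#else
	fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY);
#endif
	if (fd == -1)
		return (-1);
	nread = syscall(SYS_read, (int)fd, buf, sizeof(buf));
	syscall(SYS_close, (int)fd);
	if (nread < 1)
		return (-1);
	return (buf[0] - '0');	/* 0 = heuristic, 1 = always, 2 = never */
}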
*/ ret.v = iallocztm(tsd_tsdn(tsd), sizeof(prof_tctx_t), size2index(sizeof(prof_tctx_t)), false, NULL, true, - arena_ichoose(tsd_tsdn(tsd), NULL), true); + arena_ichoose(tsd, NULL), true); if (ret.p == NULL) { if (new_gctx) prof_gctx_try_destroy(tsd, tdata, gctx, tdata); @@ -810,8 +810,7 @@ prof_lookup(tsd_t *tsd, prof_bt_t *bt) ret.p->prepared = true; ret.p->state = prof_tctx_state_initializing; malloc_mutex_lock(tsd_tsdn(tsd), tdata->lock); - error = ckh_insert(tsd_tsdn(tsd), &tdata->bt2tctx, btkey, - ret.v); + error = ckh_insert(tsd, &tdata->bt2tctx, btkey, ret.v); malloc_mutex_unlock(tsd_tsdn(tsd), tdata->lock); if (error) { if (new_gctx) @@ -1791,7 +1790,7 @@ prof_thr_uid_alloc(tsdn_t *tsdn) } static prof_tdata_t * -prof_tdata_init_impl(tsdn_t *tsdn, uint64_t thr_uid, uint64_t thr_discrim, +prof_tdata_init_impl(tsd_t *tsd, uint64_t thr_uid, uint64_t thr_discrim, char *thread_name, bool active) { prof_tdata_t *tdata; @@ -1799,7 +1798,7 @@ prof_tdata_init_impl(tsdn_t *tsdn, uint64_t thr_uid, uint64_t thr_discrim, cassert(config_prof); /* Initialize an empty cache for this thread. */ - tdata = (prof_tdata_t *)iallocztm(tsdn, sizeof(prof_tdata_t), + tdata = (prof_tdata_t *)iallocztm(tsd_tsdn(tsd), sizeof(prof_tdata_t), size2index(sizeof(prof_tdata_t)), false, NULL, true, arena_get(TSDN_NULL, 0, true), true); if (tdata == NULL) @@ -1813,9 +1812,9 @@ prof_tdata_init_impl(tsdn_t *tsdn, uint64_t thr_uid, uint64_t thr_discrim, tdata->expired = false; tdata->tctx_uid_next = 0; - if (ckh_new(tsdn, &tdata->bt2tctx, PROF_CKH_MINITEMS, - prof_bt_hash, prof_bt_keycomp)) { - idalloctm(tsdn, tdata, NULL, true, true); + if (ckh_new(tsd, &tdata->bt2tctx, PROF_CKH_MINITEMS, prof_bt_hash, + prof_bt_keycomp)) { + idalloctm(tsd_tsdn(tsd), tdata, NULL, true, true); return (NULL); } @@ -1829,19 +1828,19 @@ prof_tdata_init_impl(tsdn_t *tsdn, uint64_t thr_uid, uint64_t thr_discrim, tdata->dumping = false; tdata->active = active; - malloc_mutex_lock(tsdn, &tdatas_mtx); + malloc_mutex_lock(tsd_tsdn(tsd), &tdatas_mtx); tdata_tree_insert(&tdatas, tdata); - malloc_mutex_unlock(tsdn, &tdatas_mtx); + malloc_mutex_unlock(tsd_tsdn(tsd), &tdatas_mtx); return (tdata); } prof_tdata_t * -prof_tdata_init(tsdn_t *tsdn) +prof_tdata_init(tsd_t *tsd) { - return (prof_tdata_init_impl(tsdn, prof_thr_uid_alloc(tsdn), 0, NULL, - prof_thread_active_init_get(tsdn))); + return (prof_tdata_init_impl(tsd, prof_thr_uid_alloc(tsd_tsdn(tsd)), 0, + NULL, prof_thread_active_init_get(tsd_tsdn(tsd)))); } static bool @@ -1866,31 +1865,29 @@ prof_tdata_should_destroy(tsdn_t *tsdn, prof_tdata_t *tdata, } static void -prof_tdata_destroy_locked(tsdn_t *tsdn, prof_tdata_t *tdata, +prof_tdata_destroy_locked(tsd_t *tsd, prof_tdata_t *tdata, bool even_if_attached) { - malloc_mutex_assert_owner(tsdn, &tdatas_mtx); - - assert(tsdn_null(tsdn) || tsd_prof_tdata_get(tsdn_tsd(tsdn)) != tdata); + malloc_mutex_assert_owner(tsd_tsdn(tsd), &tdatas_mtx); tdata_tree_remove(&tdatas, tdata); assert(prof_tdata_should_destroy_unlocked(tdata, even_if_attached)); if (tdata->thread_name != NULL) - idalloctm(tsdn, tdata->thread_name, NULL, true, true); - ckh_delete(tsdn, &tdata->bt2tctx); - idalloctm(tsdn, tdata, NULL, true, true); + idalloctm(tsd_tsdn(tsd), tdata->thread_name, NULL, true, true); + ckh_delete(tsd, &tdata->bt2tctx); + idalloctm(tsd_tsdn(tsd), tdata, NULL, true, true); } static void -prof_tdata_destroy(tsdn_t *tsdn, prof_tdata_t *tdata, bool even_if_attached) +prof_tdata_destroy(tsd_t *tsd, prof_tdata_t *tdata, bool even_if_attached) { - 
malloc_mutex_lock(tsdn, &tdatas_mtx); - prof_tdata_destroy_locked(tsdn, tdata, even_if_attached); - malloc_mutex_unlock(tsdn, &tdatas_mtx); + malloc_mutex_lock(tsd_tsdn(tsd), &tdatas_mtx); + prof_tdata_destroy_locked(tsd, tdata, even_if_attached); + malloc_mutex_unlock(tsd_tsdn(tsd), &tdatas_mtx); } static void @@ -1913,7 +1910,7 @@ prof_tdata_detach(tsd_t *tsd, prof_tdata_t *tdata) destroy_tdata = false; malloc_mutex_unlock(tsd_tsdn(tsd), tdata->lock); if (destroy_tdata) - prof_tdata_destroy(tsd_tsdn(tsd), tdata, true); + prof_tdata_destroy(tsd, tdata, true); } prof_tdata_t * @@ -1926,8 +1923,8 @@ prof_tdata_reinit(tsd_t *tsd, prof_tdata_t *tdata) bool active = tdata->active; prof_tdata_detach(tsd, tdata); - return (prof_tdata_init_impl(tsd_tsdn(tsd), thr_uid, thr_discrim, - thread_name, active)); + return (prof_tdata_init_impl(tsd, thr_uid, thr_discrim, thread_name, + active)); } static bool @@ -1956,30 +1953,30 @@ prof_tdata_reset_iter(prof_tdata_tree_t *tdatas, prof_tdata_t *tdata, void *arg) } void -prof_reset(tsdn_t *tsdn, size_t lg_sample) +prof_reset(tsd_t *tsd, size_t lg_sample) { prof_tdata_t *next; assert(lg_sample < (sizeof(uint64_t) << 3)); - malloc_mutex_lock(tsdn, &prof_dump_mtx); - malloc_mutex_lock(tsdn, &tdatas_mtx); + malloc_mutex_lock(tsd_tsdn(tsd), &prof_dump_mtx); + malloc_mutex_lock(tsd_tsdn(tsd), &tdatas_mtx); lg_prof_sample = lg_sample; next = NULL; do { prof_tdata_t *to_destroy = tdata_tree_iter(&tdatas, next, - prof_tdata_reset_iter, (void *)tsdn); + prof_tdata_reset_iter, (void *)tsd); if (to_destroy != NULL) { next = tdata_tree_next(&tdatas, to_destroy); - prof_tdata_destroy_locked(tsdn, to_destroy, false); + prof_tdata_destroy_locked(tsd, to_destroy, false); } else next = NULL; } while (next != NULL); - malloc_mutex_unlock(tsdn, &tdatas_mtx); - malloc_mutex_unlock(tsdn, &prof_dump_mtx); + malloc_mutex_unlock(tsd_tsdn(tsd), &tdatas_mtx); + malloc_mutex_unlock(tsd_tsdn(tsd), &prof_dump_mtx); } void @@ -2189,7 +2186,7 @@ prof_boot1(void) } bool -prof_boot2(tsdn_t *tsdn) +prof_boot2(tsd_t *tsd) { cassert(config_prof); @@ -2215,7 +2212,7 @@ prof_boot2(tsdn_t *tsdn) WITNESS_RANK_PROF_THREAD_ACTIVE_INIT)) return (true); - if (ckh_new(tsdn, &bt2gctx, PROF_CKH_MINITEMS, prof_bt_hash, + if (ckh_new(tsd, &bt2gctx, PROF_CKH_MINITEMS, prof_bt_hash, prof_bt_keycomp)) return (true); if (malloc_mutex_init(&bt2gctx_mtx, "prof_bt2gctx", @@ -2246,8 +2243,8 @@ prof_boot2(tsdn_t *tsdn) abort(); } - gctx_locks = (malloc_mutex_t *)base_alloc(tsdn, PROF_NCTX_LOCKS - * sizeof(malloc_mutex_t)); + gctx_locks = (malloc_mutex_t *)base_alloc(tsd_tsdn(tsd), + PROF_NCTX_LOCKS * sizeof(malloc_mutex_t)); if (gctx_locks == NULL) return (true); for (i = 0; i < PROF_NCTX_LOCKS; i++) { @@ -2256,7 +2253,7 @@ prof_boot2(tsdn_t *tsdn) return (true); } - tdata_locks = (malloc_mutex_t *)base_alloc(tsdn, + tdata_locks = (malloc_mutex_t *)base_alloc(tsd_tsdn(tsd), PROF_NTDATA_LOCKS * sizeof(malloc_mutex_t)); if (tdata_locks == NULL) return (true); diff --git a/src/rtree.c b/src/rtree.c index 3166b45f..f2e2997d 100644 --- a/src/rtree.c +++ b/src/rtree.c @@ -96,12 +96,15 @@ rtree_node_init(rtree_t *rtree, unsigned level, rtree_node_elm_t **elmp) rtree_node_elm_t *node; if (atomic_cas_p((void **)elmp, NULL, RTREE_NODE_INITIALIZING)) { + spin_t spinner; + /* * Another thread is already in the process of initializing. * Spin-wait until initialization is complete. 
*/ + spin_init(&spinner); do { - CPU_SPINWAIT; + spin_adaptive(&spinner); node = atomic_read_p((void **)elmp); } while (node == RTREE_NODE_INITIALIZING); } else { @@ -125,5 +128,5 @@ rtree_node_elm_t * rtree_child_read_hard(rtree_t *rtree, rtree_node_elm_t *elm, unsigned level) { - return (rtree_node_init(rtree, level, &elm->child)); + return (rtree_node_init(rtree, level+1, &elm->child)); } diff --git a/src/spin.c b/src/spin.c new file mode 100644 index 00000000..5242d95a --- /dev/null +++ b/src/spin.c @@ -0,0 +1,2 @@ +#define JEMALLOC_SPIN_C_ +#include "jemalloc/internal/jemalloc_internal.h" diff --git a/src/stats.c b/src/stats.c index 073be4fe..bd8af399 100644 --- a/src/stats.c +++ b/src/stats.c @@ -32,86 +32,107 @@ bool opt_stats_print = false; size_t stats_cactive = 0; -/******************************************************************************/ -/* Function prototypes for non-inline static functions. */ - -static void stats_arena_bins_print(void (*write_cb)(void *, const char *), - void *cbopaque, unsigned i); -static void stats_arena_lruns_print(void (*write_cb)(void *, const char *), - void *cbopaque, unsigned i); -static void stats_arena_hchunks_print( - void (*write_cb)(void *, const char *), void *cbopaque, unsigned i); -static void stats_arena_print(void (*write_cb)(void *, const char *), - void *cbopaque, unsigned i, bool bins, bool large, bool huge); - /******************************************************************************/ static void stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, - unsigned i) + bool json, bool large, bool huge, unsigned i) { size_t page; - bool config_tcache, in_gap; + bool config_tcache, in_gap, in_gap_prev; unsigned nbins, j; CTL_GET("arenas.page", &page, size_t); - CTL_GET("config.tcache", &config_tcache, bool); - if (config_tcache) { - malloc_cprintf(write_cb, cbopaque, - "bins: size ind allocated nmalloc" - " ndalloc nrequests curregs curruns regs" - " pgs util nfills nflushes newruns" - " reruns\n"); - } else { - malloc_cprintf(write_cb, cbopaque, - "bins: size ind allocated nmalloc" - " ndalloc nrequests curregs curruns regs" - " pgs util newruns reruns\n"); - } CTL_GET("arenas.nbins", &nbins, unsigned); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"bins\": [\n"); + } else { + CTL_GET("config.tcache", &config_tcache, bool); + if (config_tcache) { + malloc_cprintf(write_cb, cbopaque, + "bins: size ind allocated nmalloc" + " ndalloc nrequests curregs" + " curruns regs pgs util nfills" + " nflushes newruns reruns\n"); + } else { + malloc_cprintf(write_cb, cbopaque, + "bins: size ind allocated nmalloc" + " ndalloc nrequests curregs" + " curruns regs pgs util newruns" + " reruns\n"); + } + } for (j = 0, in_gap = false; j < nbins; j++) { uint64_t nruns; + size_t reg_size, run_size, curregs; + size_t curruns; + uint32_t nregs; + uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes; + uint64_t nreruns; CTL_M2_M4_GET("stats.arenas.0.bins.0.nruns", i, j, &nruns, uint64_t); - if (nruns == 0) - in_gap = true; - else { - size_t reg_size, run_size, curregs, availregs, milli; - size_t curruns; - uint32_t nregs; - uint64_t nmalloc, ndalloc, nrequests, nfills, nflushes; - uint64_t reruns; - char util[6]; /* "x.yyy". 
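rtree_node_init() above combines the two new primitives from this release: a CAS-claimed initialization slot (RTREE_NODE_INITIALIZING as a sentinel) and spin_adaptive(), which spins in exponentially longer bursts instead of hammering the slot. A self-contained sketch under those assumptions; BUSY and node_get() are hypothetical names, and a full fence stands in for the CPU_SPINWAIT pause hint:

#include <stdint.h>
#include <stdlib.h>

#define	BUSY	((void *)0x1)	/* analogue of RTREE_NODE_INITIALIZING */

static void *
node_get(void **slotp)
{
	void *node = NULL;

	if (__atomic_compare_exchange_n(slotp, &node, BUSY, false,
	    __ATOMIC_ACQ_REL, __ATOMIC_ACQUIRE)) {
		/* This thread won the claim; it alone initializes. */
		node = calloc(1, 64);	/* error handling elided */
		__atomic_store_n(slotp, node, __ATOMIC_RELEASE);
	} else {
		unsigned iteration = 0;

		do {
			/* Adaptive spin: 2^iteration pauses, capped. */
			for (uint64_t i = 0; i < (UINT64_C(1) << iteration);
			    i++)
				__atomic_thread_fence(__ATOMIC_SEQ_CST);
			if (iteration < 6)
				iteration++;
			node = __atomic_load_n(slotp, __ATOMIC_ACQUIRE);
		} while (node == BUSY);
	}
	return (node);
}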
*/ + in_gap_prev = in_gap; + in_gap = (nruns == 0); - if (in_gap) { - malloc_cprintf(write_cb, cbopaque, - " ---\n"); - in_gap = false; - } - CTL_M2_GET("arenas.bin.0.size", j, ®_size, size_t); - CTL_M2_GET("arenas.bin.0.nregs", j, &nregs, uint32_t); - CTL_M2_GET("arenas.bin.0.run_size", j, &run_size, - size_t); - CTL_M2_M4_GET("stats.arenas.0.bins.0.nmalloc", i, j, - &nmalloc, uint64_t); - CTL_M2_M4_GET("stats.arenas.0.bins.0.ndalloc", i, j, - &ndalloc, uint64_t); - CTL_M2_M4_GET("stats.arenas.0.bins.0.curregs", i, j, - &curregs, size_t); - CTL_M2_M4_GET("stats.arenas.0.bins.0.nrequests", i, j, - &nrequests, uint64_t); + if (!json && in_gap_prev && !in_gap) { + malloc_cprintf(write_cb, cbopaque, + " ---\n"); + } + + CTL_M2_GET("arenas.bin.0.size", j, ®_size, size_t); + CTL_M2_GET("arenas.bin.0.nregs", j, &nregs, uint32_t); + CTL_M2_GET("arenas.bin.0.run_size", j, &run_size, size_t); + + CTL_M2_M4_GET("stats.arenas.0.bins.0.nmalloc", i, j, &nmalloc, + uint64_t); + CTL_M2_M4_GET("stats.arenas.0.bins.0.ndalloc", i, j, &ndalloc, + uint64_t); + CTL_M2_M4_GET("stats.arenas.0.bins.0.curregs", i, j, &curregs, + size_t); + CTL_M2_M4_GET("stats.arenas.0.bins.0.nrequests", i, j, + &nrequests, uint64_t); + if (config_tcache) { + CTL_M2_M4_GET("stats.arenas.0.bins.0.nfills", i, j, + &nfills, uint64_t); + CTL_M2_M4_GET("stats.arenas.0.bins.0.nflushes", i, j, + &nflushes, uint64_t); + } + CTL_M2_M4_GET("stats.arenas.0.bins.0.nreruns", i, j, &nreruns, + uint64_t); + CTL_M2_M4_GET("stats.arenas.0.bins.0.curruns", i, j, &curruns, + size_t); + + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t{\n" + "\t\t\t\t\t\t\"nmalloc\": %"FMTu64",\n" + "\t\t\t\t\t\t\"ndalloc\": %"FMTu64",\n" + "\t\t\t\t\t\t\"curregs\": %zu,\n" + "\t\t\t\t\t\t\"nrequests\": %"FMTu64",\n", + nmalloc, + ndalloc, + curregs, + nrequests); if (config_tcache) { - CTL_M2_M4_GET("stats.arenas.0.bins.0.nfills", i, - j, &nfills, uint64_t); - CTL_M2_M4_GET("stats.arenas.0.bins.0.nflushes", - i, j, &nflushes, uint64_t); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\t\"nfills\": %"FMTu64",\n" + "\t\t\t\t\t\t\"nflushes\": %"FMTu64",\n", + nfills, + nflushes); } - CTL_M2_M4_GET("stats.arenas.0.bins.0.nreruns", i, j, - &reruns, uint64_t); - CTL_M2_M4_GET("stats.arenas.0.bins.0.curruns", i, j, - &curruns, size_t); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\t\"nreruns\": %"FMTu64",\n" + "\t\t\t\t\t\t\"curruns\": %zu\n" + "\t\t\t\t\t}%s\n", + nreruns, + curruns, + (j + 1 < nbins) ? "," : ""); + } else if (!in_gap) { + size_t availregs, milli; + char util[6]; /* "x.yyy". */ availregs = nregs * curruns; milli = (availregs != 0) ? (1000 * curregs) / availregs @@ -138,7 +159,7 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, reg_size, j, curregs * reg_size, nmalloc, ndalloc, nrequests, curregs, curruns, nregs, run_size / page, util, nfills, nflushes, - nruns, reruns); + nruns, nreruns); } else { malloc_cprintf(write_cb, cbopaque, "%20zu %3u %12zu %12"FMTu64 @@ -147,28 +168,38 @@ stats_arena_bins_print(void (*write_cb)(void *, const char *), void *cbopaque, " %12"FMTu64"\n", reg_size, j, curregs * reg_size, nmalloc, ndalloc, nrequests, curregs, curruns, nregs, - run_size / page, util, nruns, reruns); + run_size / page, util, nruns, nreruns); } } } - if (in_gap) { + if (json) { malloc_cprintf(write_cb, cbopaque, - " ---\n"); + "\t\t\t\t]%s\n", (large || huge) ? 
"," : ""); + } else { + if (in_gap) { + malloc_cprintf(write_cb, cbopaque, + " ---\n"); + } } } static void stats_arena_lruns_print(void (*write_cb)(void *, const char *), void *cbopaque, - unsigned i) + bool json, bool huge, unsigned i) { unsigned nbins, nlruns, j; - bool in_gap; + bool in_gap, in_gap_prev; - malloc_cprintf(write_cb, cbopaque, - "large: size ind allocated nmalloc ndalloc" - " nrequests curruns\n"); CTL_GET("arenas.nbins", &nbins, unsigned); CTL_GET("arenas.nlruns", &nlruns, unsigned); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"lruns\": [\n"); + } else { + malloc_cprintf(write_cb, cbopaque, + "large: size ind allocated nmalloc" + " ndalloc nrequests curruns\n"); + } for (j = 0, in_gap = false; j < nlruns; j++) { uint64_t nmalloc, ndalloc, nrequests; size_t run_size, curruns; @@ -179,17 +210,25 @@ stats_arena_lruns_print(void (*write_cb)(void *, const char *), void *cbopaque, uint64_t); CTL_M2_M4_GET("stats.arenas.0.lruns.0.nrequests", i, j, &nrequests, uint64_t); - if (nrequests == 0) - in_gap = true; - else { - CTL_M2_GET("arenas.lrun.0.size", j, &run_size, size_t); - CTL_M2_M4_GET("stats.arenas.0.lruns.0.curruns", i, j, - &curruns, size_t); - if (in_gap) { - malloc_cprintf(write_cb, cbopaque, - " ---\n"); - in_gap = false; - } + in_gap_prev = in_gap; + in_gap = (nrequests == 0); + + if (!json && in_gap_prev && !in_gap) { + malloc_cprintf(write_cb, cbopaque, + " ---\n"); + } + + CTL_M2_GET("arenas.lrun.0.size", j, &run_size, size_t); + CTL_M2_M4_GET("stats.arenas.0.lruns.0.curruns", i, j, &curruns, + size_t); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t{\n" + "\t\t\t\t\t\t\"curruns\": %zu\n" + "\t\t\t\t\t}%s\n", + curruns, + (j + 1 < nlruns) ? "," : ""); + } else if (!in_gap) { malloc_cprintf(write_cb, cbopaque, "%20zu %3u %12zu %12"FMTu64" %12"FMTu64 " %12"FMTu64" %12zu\n", @@ -197,25 +236,35 @@ stats_arena_lruns_print(void (*write_cb)(void *, const char *), void *cbopaque, ndalloc, nrequests, curruns); } } - if (in_gap) { + if (json) { malloc_cprintf(write_cb, cbopaque, - " ---\n"); + "\t\t\t\t]%s\n", huge ? 
"," : ""); + } else { + if (in_gap) { + malloc_cprintf(write_cb, cbopaque, + " ---\n"); + } } } static void stats_arena_hchunks_print(void (*write_cb)(void *, const char *), - void *cbopaque, unsigned i) + void *cbopaque, bool json, unsigned i) { unsigned nbins, nlruns, nhchunks, j; - bool in_gap; + bool in_gap, in_gap_prev; - malloc_cprintf(write_cb, cbopaque, - "huge: size ind allocated nmalloc ndalloc" - " nrequests curhchunks\n"); CTL_GET("arenas.nbins", &nbins, unsigned); CTL_GET("arenas.nlruns", &nlruns, unsigned); CTL_GET("arenas.nhchunks", &nhchunks, unsigned); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"hchunks\": [\n"); + } else { + malloc_cprintf(write_cb, cbopaque, + "huge: size ind allocated nmalloc" + " ndalloc nrequests curhchunks\n"); + } for (j = 0, in_gap = false; j < nhchunks; j++) { uint64_t nmalloc, ndalloc, nrequests; size_t hchunk_size, curhchunks; @@ -226,18 +275,25 @@ stats_arena_hchunks_print(void (*write_cb)(void *, const char *), &ndalloc, uint64_t); CTL_M2_M4_GET("stats.arenas.0.hchunks.0.nrequests", i, j, &nrequests, uint64_t); - if (nrequests == 0) - in_gap = true; - else { - CTL_M2_GET("arenas.hchunk.0.size", j, &hchunk_size, - size_t); - CTL_M2_M4_GET("stats.arenas.0.hchunks.0.curhchunks", i, - j, &curhchunks, size_t); - if (in_gap) { - malloc_cprintf(write_cb, cbopaque, - " ---\n"); - in_gap = false; - } + in_gap_prev = in_gap; + in_gap = (nrequests == 0); + + if (!json && in_gap_prev && !in_gap) { + malloc_cprintf(write_cb, cbopaque, + " ---\n"); + } + + CTL_M2_GET("arenas.hchunk.0.size", j, &hchunk_size, size_t); + CTL_M2_M4_GET("stats.arenas.0.hchunks.0.curhchunks", i, j, + &curhchunks, size_t); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t{\n" + "\t\t\t\t\t\t\"curhchunks\": %zu\n" + "\t\t\t\t\t}%s\n", + curhchunks, + (j + 1 < nhchunks) ? 
"," : ""); + } else if (!in_gap) { malloc_cprintf(write_cb, cbopaque, "%20zu %3u %12zu %12"FMTu64" %12"FMTu64 " %12"FMTu64" %12zu\n", @@ -246,15 +302,20 @@ stats_arena_hchunks_print(void (*write_cb)(void *, const char *), nrequests, curhchunks); } } - if (in_gap) { + if (json) { malloc_cprintf(write_cb, cbopaque, - " ---\n"); + "\t\t\t\t]\n"); + } else { + if (in_gap) { + malloc_cprintf(write_cb, cbopaque, + " ---\n"); + } } } static void stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, - unsigned i, bool bins, bool large, bool huge) + bool json, unsigned i, bool bins, bool large, bool huge) { unsigned nthreads; const char *dss; @@ -272,100 +333,731 @@ stats_arena_print(void (*write_cb)(void *, const char *), void *cbopaque, CTL_GET("arenas.page", &page, size_t); CTL_M2_GET("stats.arenas.0.nthreads", i, &nthreads, unsigned); - malloc_cprintf(write_cb, cbopaque, - "assigned threads: %u\n", nthreads); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"nthreads\": %u,\n", nthreads); + } else { + malloc_cprintf(write_cb, cbopaque, + "assigned threads: %u\n", nthreads); + } + CTL_M2_GET("stats.arenas.0.dss", i, &dss, const char *); - malloc_cprintf(write_cb, cbopaque, "dss allocation precedence: %s\n", - dss); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"dss\": \"%s\",\n", dss); + } else { + malloc_cprintf(write_cb, cbopaque, + "dss allocation precedence: %s\n", dss); + } + CTL_M2_GET("stats.arenas.0.lg_dirty_mult", i, &lg_dirty_mult, ssize_t); - if (opt_purge == purge_mode_ratio) { - if (lg_dirty_mult >= 0) { - malloc_cprintf(write_cb, cbopaque, - "min active:dirty page ratio: %u:1\n", - (1U << lg_dirty_mult)); - } else { - malloc_cprintf(write_cb, cbopaque, - "min active:dirty page ratio: N/A\n"); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"lg_dirty_mult\": %zd,\n", lg_dirty_mult); + } else { + if (opt_purge == purge_mode_ratio) { + if (lg_dirty_mult >= 0) { + malloc_cprintf(write_cb, cbopaque, + "min active:dirty page ratio: %u:1\n", + (1U << lg_dirty_mult)); + } else { + malloc_cprintf(write_cb, cbopaque, + "min active:dirty page ratio: N/A\n"); + } } } + CTL_M2_GET("stats.arenas.0.decay_time", i, &decay_time, ssize_t); - if (opt_purge == purge_mode_decay) { - if (decay_time >= 0) { - malloc_cprintf(write_cb, cbopaque, "decay time: %zd\n", - decay_time); - } else - malloc_cprintf(write_cb, cbopaque, "decay time: N/A\n"); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"decay_time\": %zd,\n", decay_time); + } else { + if (opt_purge == purge_mode_decay) { + if (decay_time >= 0) { + malloc_cprintf(write_cb, cbopaque, + "decay time: %zd\n", decay_time); + } else { + malloc_cprintf(write_cb, cbopaque, + "decay time: N/A\n"); + } + } } + CTL_M2_GET("stats.arenas.0.pactive", i, &pactive, size_t); CTL_M2_GET("stats.arenas.0.pdirty", i, &pdirty, size_t); CTL_M2_GET("stats.arenas.0.npurge", i, &npurge, uint64_t); CTL_M2_GET("stats.arenas.0.nmadvise", i, &nmadvise, uint64_t); CTL_M2_GET("stats.arenas.0.purged", i, &purged, uint64_t); - malloc_cprintf(write_cb, cbopaque, - "purging: dirty: %zu, sweeps: %"FMTu64", madvises: %"FMTu64", " - "purged: %"FMTu64"\n", pdirty, npurge, nmadvise, purged); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"pactive\": %zu,\n", pactive); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"pdirty\": %zu,\n", pdirty); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"npurge\": %"FMTu64",\n", npurge); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"nmadvise\": 
%"FMTu64",\n", nmadvise); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"purged\": %"FMTu64",\n", purged); + } else { + malloc_cprintf(write_cb, cbopaque, + "purging: dirty: %zu, sweeps: %"FMTu64", madvises: %"FMTu64 + ", purged: %"FMTu64"\n", pdirty, npurge, nmadvise, purged); + } - malloc_cprintf(write_cb, cbopaque, - " allocated nmalloc ndalloc" - " nrequests\n"); CTL_M2_GET("stats.arenas.0.small.allocated", i, &small_allocated, size_t); CTL_M2_GET("stats.arenas.0.small.nmalloc", i, &small_nmalloc, uint64_t); CTL_M2_GET("stats.arenas.0.small.ndalloc", i, &small_ndalloc, uint64_t); CTL_M2_GET("stats.arenas.0.small.nrequests", i, &small_nrequests, uint64_t); - malloc_cprintf(write_cb, cbopaque, - "small: %12zu %12"FMTu64" %12"FMTu64 - " %12"FMTu64"\n", - small_allocated, small_nmalloc, small_ndalloc, small_nrequests); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"small\": {\n"); + + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"allocated\": %zu,\n", small_allocated); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"nmalloc\": %"FMTu64",\n", small_nmalloc); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"ndalloc\": %"FMTu64",\n", small_ndalloc); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"nrequests\": %"FMTu64"\n", small_nrequests); + + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t},\n"); + } else { + malloc_cprintf(write_cb, cbopaque, + " allocated nmalloc" + " ndalloc nrequests\n"); + malloc_cprintf(write_cb, cbopaque, + "small: %12zu %12"FMTu64" %12"FMTu64 + " %12"FMTu64"\n", + small_allocated, small_nmalloc, small_ndalloc, + small_nrequests); + } + CTL_M2_GET("stats.arenas.0.large.allocated", i, &large_allocated, size_t); CTL_M2_GET("stats.arenas.0.large.nmalloc", i, &large_nmalloc, uint64_t); CTL_M2_GET("stats.arenas.0.large.ndalloc", i, &large_ndalloc, uint64_t); CTL_M2_GET("stats.arenas.0.large.nrequests", i, &large_nrequests, uint64_t); - malloc_cprintf(write_cb, cbopaque, - "large: %12zu %12"FMTu64" %12"FMTu64 - " %12"FMTu64"\n", - large_allocated, large_nmalloc, large_ndalloc, large_nrequests); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"large\": {\n"); + + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"allocated\": %zu,\n", large_allocated); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"nmalloc\": %"FMTu64",\n", large_nmalloc); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"ndalloc\": %"FMTu64",\n", large_ndalloc); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"nrequests\": %"FMTu64"\n", large_nrequests); + + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t},\n"); + } else { + malloc_cprintf(write_cb, cbopaque, + "large: %12zu %12"FMTu64" %12"FMTu64 + " %12"FMTu64"\n", + large_allocated, large_nmalloc, large_ndalloc, + large_nrequests); + } + CTL_M2_GET("stats.arenas.0.huge.allocated", i, &huge_allocated, size_t); CTL_M2_GET("stats.arenas.0.huge.nmalloc", i, &huge_nmalloc, uint64_t); CTL_M2_GET("stats.arenas.0.huge.ndalloc", i, &huge_ndalloc, uint64_t); CTL_M2_GET("stats.arenas.0.huge.nrequests", i, &huge_nrequests, uint64_t); - malloc_cprintf(write_cb, cbopaque, - "huge: %12zu %12"FMTu64" %12"FMTu64 - " %12"FMTu64"\n", - huge_allocated, huge_nmalloc, huge_ndalloc, huge_nrequests); - malloc_cprintf(write_cb, cbopaque, - "total: %12zu %12"FMTu64" %12"FMTu64 - " %12"FMTu64"\n", - small_allocated + large_allocated + huge_allocated, - small_nmalloc + large_nmalloc + huge_nmalloc, - small_ndalloc + large_ndalloc + huge_ndalloc, - small_nrequests + large_nrequests + huge_nrequests); - 
malloc_cprintf(write_cb, cbopaque, - "active: %12zu\n", pactive * page); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"huge\": {\n"); + + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"allocated\": %zu,\n", huge_allocated); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"nmalloc\": %"FMTu64",\n", huge_nmalloc); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"ndalloc\": %"FMTu64",\n", huge_ndalloc); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"nrequests\": %"FMTu64"\n", huge_nrequests); + + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t},\n"); + } else { + malloc_cprintf(write_cb, cbopaque, + "huge: %12zu %12"FMTu64" %12"FMTu64 + " %12"FMTu64"\n", + huge_allocated, huge_nmalloc, huge_ndalloc, huge_nrequests); + malloc_cprintf(write_cb, cbopaque, + "total: %12zu %12"FMTu64" %12"FMTu64 + " %12"FMTu64"\n", + small_allocated + large_allocated + huge_allocated, + small_nmalloc + large_nmalloc + huge_nmalloc, + small_ndalloc + large_ndalloc + huge_ndalloc, + small_nrequests + large_nrequests + huge_nrequests); + } + if (!json) { + malloc_cprintf(write_cb, cbopaque, + "active: %12zu\n", pactive * page); + } + CTL_M2_GET("stats.arenas.0.mapped", i, &mapped, size_t); - malloc_cprintf(write_cb, cbopaque, - "mapped: %12zu\n", mapped); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"mapped\": %zu,\n", mapped); + } else { + malloc_cprintf(write_cb, cbopaque, + "mapped: %12zu\n", mapped); + } + CTL_M2_GET("stats.arenas.0.retained", i, &retained, size_t); - malloc_cprintf(write_cb, cbopaque, - "retained: %12zu\n", retained); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"retained\": %zu,\n", retained); + } else { + malloc_cprintf(write_cb, cbopaque, + "retained: %12zu\n", retained); + } + CTL_M2_GET("stats.arenas.0.metadata.mapped", i, &metadata_mapped, size_t); CTL_M2_GET("stats.arenas.0.metadata.allocated", i, &metadata_allocated, size_t); - malloc_cprintf(write_cb, cbopaque, - "metadata: mapped: %zu, allocated: %zu\n", - metadata_mapped, metadata_allocated); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\"metadata\": {\n"); - if (bins) - stats_arena_bins_print(write_cb, cbopaque, i); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"mapped\": %zu,\n", metadata_mapped); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"allocated\": %zu\n", metadata_allocated); + + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t},\n"); + } else { + malloc_cprintf(write_cb, cbopaque, + "metadata: mapped: %zu, allocated: %zu\n", + metadata_mapped, metadata_allocated); + } + + if (bins) { + stats_arena_bins_print(write_cb, cbopaque, json, large, huge, + i); + } if (large) - stats_arena_lruns_print(write_cb, cbopaque, i); + stats_arena_lruns_print(write_cb, cbopaque, json, huge, i); if (huge) - stats_arena_hchunks_print(write_cb, cbopaque, i); + stats_arena_hchunks_print(write_cb, cbopaque, json, i); +} + +static void +stats_general_print(void (*write_cb)(void *, const char *), void *cbopaque, + bool json, bool merged, bool unmerged) +{ + const char *cpv; + bool bv; + unsigned uv; + uint32_t u32v; + uint64_t u64v; + ssize_t ssv; + size_t sv, bsz, usz, ssz, sssz, cpsz; + + bsz = sizeof(bool); + usz = sizeof(unsigned); + ssz = sizeof(size_t); + sssz = sizeof(ssize_t); + cpsz = sizeof(const char *); + + CTL_GET("version", &cpv, const char *); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\"version\": \"%s\",\n", cpv); + } else + malloc_cprintf(write_cb, cbopaque, "Version: %s\n", cpv); + + /* config. 
*/ +#define CONFIG_WRITE_BOOL_JSON(n, c) \ + if (json) { \ + CTL_GET("config."#n, &bv, bool); \ + malloc_cprintf(write_cb, cbopaque, \ + "\t\t\t\""#n"\": %s%s\n", bv ? "true" : "false", \ + (c)); \ + } + + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\"config\": {\n"); + } + + CONFIG_WRITE_BOOL_JSON(cache_oblivious, ",") + + CTL_GET("config.debug", &bv, bool); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"debug\": %s,\n", bv ? "true" : "false"); + } else { + malloc_cprintf(write_cb, cbopaque, "Assertions %s\n", + bv ? "enabled" : "disabled"); + } + + CONFIG_WRITE_BOOL_JSON(fill, ",") + CONFIG_WRITE_BOOL_JSON(lazy_lock, ",") + + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"malloc_conf\": \"%s\",\n", + config_malloc_conf); + } else { + malloc_cprintf(write_cb, cbopaque, + "config.malloc_conf: \"%s\"\n", config_malloc_conf); + } + + CONFIG_WRITE_BOOL_JSON(munmap, ",") + CONFIG_WRITE_BOOL_JSON(prof, ",") + CONFIG_WRITE_BOOL_JSON(prof_libgcc, ",") + CONFIG_WRITE_BOOL_JSON(prof_libunwind, ",") + CONFIG_WRITE_BOOL_JSON(stats, ",") + CONFIG_WRITE_BOOL_JSON(tcache, ",") + CONFIG_WRITE_BOOL_JSON(tls, ",") + CONFIG_WRITE_BOOL_JSON(utrace, ",") + CONFIG_WRITE_BOOL_JSON(valgrind, ",") + CONFIG_WRITE_BOOL_JSON(xmalloc, "") + + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t},\n"); + } +#undef CONFIG_WRITE_BOOL_JSON + + /* opt. */ +#define OPT_WRITE_BOOL(n, c) \ + if (je_mallctl("opt."#n, (void *)&bv, &bsz, NULL, 0) == 0) { \ + if (json) { \ + malloc_cprintf(write_cb, cbopaque, \ + "\t\t\t\""#n"\": %s%s\n", bv ? "true" : \ + "false", (c)); \ + } else { \ + malloc_cprintf(write_cb, cbopaque, \ + " opt."#n": %s\n", bv ? "true" : "false"); \ + } \ + } +#define OPT_WRITE_BOOL_MUTABLE(n, m, c) { \ + bool bv2; \ + if (je_mallctl("opt."#n, (void *)&bv, &bsz, NULL, 0) == 0 && \ + je_mallctl(#m, &bv2, &bsz, NULL, 0) == 0) { \ + if (json) { \ + malloc_cprintf(write_cb, cbopaque, \ + "\t\t\t\""#n"\": %s%s\n", bv ? "true" : \ + "false", (c)); \ + } else { \ + malloc_cprintf(write_cb, cbopaque, \ + " opt."#n": %s ("#m": %s)\n", bv ? "true" \ + : "false", bv2 ? 
"true" : "false"); \ + } \ + } \ +} +#define OPT_WRITE_UNSIGNED(n, c) \ + if (je_mallctl("opt."#n, (void *)&uv, &usz, NULL, 0) == 0) { \ + if (json) { \ + malloc_cprintf(write_cb, cbopaque, \ + "\t\t\t\""#n"\": %u%s\n", uv, (c)); \ + } else { \ + malloc_cprintf(write_cb, cbopaque, \ + " opt."#n": %u\n", uv); \ + } \ + } +#define OPT_WRITE_SIZE_T(n, c) \ + if (je_mallctl("opt."#n, (void *)&sv, &ssz, NULL, 0) == 0) { \ + if (json) { \ + malloc_cprintf(write_cb, cbopaque, \ + "\t\t\t\""#n"\": %zu%s\n", sv, (c)); \ + } else { \ + malloc_cprintf(write_cb, cbopaque, \ + " opt."#n": %zu\n", sv); \ + } \ + } +#define OPT_WRITE_SSIZE_T(n, c) \ + if (je_mallctl("opt."#n, (void *)&ssv, &sssz, NULL, 0) == 0) { \ + if (json) { \ + malloc_cprintf(write_cb, cbopaque, \ + "\t\t\t\""#n"\": %zd%s\n", ssv, (c)); \ + } else { \ + malloc_cprintf(write_cb, cbopaque, \ + " opt."#n": %zd\n", ssv); \ + } \ + } +#define OPT_WRITE_SSIZE_T_MUTABLE(n, m, c) { \ + ssize_t ssv2; \ + if (je_mallctl("opt."#n, (void *)&ssv, &sssz, NULL, 0) == 0 && \ + je_mallctl(#m, &ssv2, &sssz, NULL, 0) == 0) { \ + if (json) { \ + malloc_cprintf(write_cb, cbopaque, \ + "\t\t\t\""#n"\": %zd%s\n", ssv, (c)); \ + } else { \ + malloc_cprintf(write_cb, cbopaque, \ + " opt."#n": %zd ("#m": %zd)\n", \ + ssv, ssv2); \ + } \ + } \ +} +#define OPT_WRITE_CHAR_P(n, c) \ + if (je_mallctl("opt."#n, (void *)&cpv, &cpsz, NULL, 0) == 0) { \ + if (json) { \ + malloc_cprintf(write_cb, cbopaque, \ + "\t\t\t\""#n"\": \"%s\"%s\n", cpv, (c)); \ + } else { \ + malloc_cprintf(write_cb, cbopaque, \ + " opt."#n": \"%s\"\n", cpv); \ + } \ + } + + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\"opt\": {\n"); + } else { + malloc_cprintf(write_cb, cbopaque, + "Run-time option settings:\n"); + } + OPT_WRITE_BOOL(abort, ",") + OPT_WRITE_SIZE_T(lg_chunk, ",") + OPT_WRITE_CHAR_P(dss, ",") + OPT_WRITE_UNSIGNED(narenas, ",") + OPT_WRITE_CHAR_P(purge, ",") + if (json || opt_purge == purge_mode_ratio) { + OPT_WRITE_SSIZE_T_MUTABLE(lg_dirty_mult, + arenas.lg_dirty_mult, ",") + } + if (json || opt_purge == purge_mode_decay) { + OPT_WRITE_SSIZE_T_MUTABLE(decay_time, arenas.decay_time, ",") + } + OPT_WRITE_CHAR_P(junk, ",") + OPT_WRITE_SIZE_T(quarantine, ",") + OPT_WRITE_BOOL(redzone, ",") + OPT_WRITE_BOOL(zero, ",") + OPT_WRITE_BOOL(utrace, ",") + OPT_WRITE_BOOL(xmalloc, ",") + OPT_WRITE_BOOL(tcache, ",") + OPT_WRITE_SSIZE_T(lg_tcache_max, ",") + OPT_WRITE_BOOL(prof, ",") + OPT_WRITE_CHAR_P(prof_prefix, ",") + OPT_WRITE_BOOL_MUTABLE(prof_active, prof.active, ",") + OPT_WRITE_BOOL_MUTABLE(prof_thread_active_init, prof.thread_active_init, + ",") + OPT_WRITE_SSIZE_T_MUTABLE(lg_prof_sample, prof.lg_sample, ",") + OPT_WRITE_BOOL(prof_accum, ",") + OPT_WRITE_SSIZE_T(lg_prof_interval, ",") + OPT_WRITE_BOOL(prof_gdump, ",") + OPT_WRITE_BOOL(prof_final, ",") + OPT_WRITE_BOOL(prof_leak, ",") + /* + * stats_print is always emitted, so as long as stats_print comes last + * it's safe to unconditionally omit the comma here (rather than having + * to conditionally omit it elsewhere depending on configuration). + */ + OPT_WRITE_BOOL(stats_print, "") + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t},\n"); + } + +#undef OPT_WRITE_BOOL +#undef OPT_WRITE_BOOL_MUTABLE +#undef OPT_WRITE_SIZE_T +#undef OPT_WRITE_SSIZE_T +#undef OPT_WRITE_CHAR_P + + /* arenas. 
*/ + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\"arenas\": {\n"); + } + + CTL_GET("arenas.narenas", &uv, unsigned); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"narenas\": %u,\n", uv); + } else + malloc_cprintf(write_cb, cbopaque, "Arenas: %u\n", uv); + + CTL_GET("arenas.lg_dirty_mult", &ssv, ssize_t); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"lg_dirty_mult\": %zd,\n", ssv); + } else if (opt_purge == purge_mode_ratio) { + if (ssv >= 0) { + malloc_cprintf(write_cb, cbopaque, + "Min active:dirty page ratio per arena: " + "%u:1\n", (1U << ssv)); + } else { + malloc_cprintf(write_cb, cbopaque, + "Min active:dirty page ratio per arena: " + "N/A\n"); + } + } + CTL_GET("arenas.decay_time", &ssv, ssize_t); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"decay_time\": %zd,\n", ssv); + } else if (opt_purge == purge_mode_decay) { + malloc_cprintf(write_cb, cbopaque, + "Unused dirty page decay time: %zd%s\n", + ssv, (ssv < 0) ? " (no decay)" : ""); + } + + CTL_GET("arenas.quantum", &sv, size_t); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"quantum\": %zu,\n", sv); + } else + malloc_cprintf(write_cb, cbopaque, "Quantum size: %zu\n", sv); + + CTL_GET("arenas.page", &sv, size_t); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"page\": %zu,\n", sv); + } else + malloc_cprintf(write_cb, cbopaque, "Page size: %zu\n", sv); + + if (je_mallctl("arenas.tcache_max", (void *)&sv, &ssz, NULL, 0) == 0) { + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"tcache_max\": %zu,\n", sv); + } else { + malloc_cprintf(write_cb, cbopaque, + "Maximum thread-cached size class: %zu\n", sv); + } + } + + if (json) { + unsigned nbins, nlruns, nhchunks, i; + + CTL_GET("arenas.nbins", &nbins, unsigned); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"nbins\": %u,\n", nbins); + + CTL_GET("arenas.nhbins", &uv, unsigned); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"nhbins\": %u,\n", uv); + + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"bin\": [\n"); + for (i = 0; i < nbins; i++) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t{\n"); + + CTL_M2_GET("arenas.bin.0.size", i, &sv, size_t); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"size\": %zu,\n", sv); + + CTL_M2_GET("arenas.bin.0.nregs", i, &u32v, uint32_t); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"nregs\": %"FMTu32",\n", u32v); + + CTL_M2_GET("arenas.bin.0.run_size", i, &sv, size_t); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"run_size\": %zu\n", sv); + + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t}%s\n", (i + 1 < nbins) ? "," : ""); + } + malloc_cprintf(write_cb, cbopaque, + "\t\t\t],\n"); + + CTL_GET("arenas.nlruns", &nlruns, unsigned); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"nlruns\": %u,\n", nlruns); + + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"lrun\": [\n"); + for (i = 0; i < nlruns; i++) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t{\n"); + + CTL_M2_GET("arenas.lrun.0.size", i, &sv, size_t); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"size\": %zu\n", sv); + + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t}%s\n", (i + 1 < nlruns) ? 
"," : ""); + } + malloc_cprintf(write_cb, cbopaque, + "\t\t\t],\n"); + + CTL_GET("arenas.nhchunks", &nhchunks, unsigned); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"nhchunks\": %u,\n", nhchunks); + + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"hchunk\": [\n"); + for (i = 0; i < nhchunks; i++) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t{\n"); + + CTL_M2_GET("arenas.hchunk.0.size", i, &sv, size_t); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t\t\"size\": %zu\n", sv); + + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\t}%s\n", (i + 1 < nhchunks) ? "," : ""); + } + malloc_cprintf(write_cb, cbopaque, + "\t\t\t]\n"); + + malloc_cprintf(write_cb, cbopaque, + "\t\t},\n"); + } + + /* prof. */ + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\"prof\": {\n"); + + CTL_GET("prof.thread_active_init", &bv, bool); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"thread_active_init\": %s,\n", bv ? "true" : + "false"); + + CTL_GET("prof.active", &bv, bool); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"active\": %s,\n", bv ? "true" : "false"); + + CTL_GET("prof.gdump", &bv, bool); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"gdump\": %s,\n", bv ? "true" : "false"); + + CTL_GET("prof.interval", &u64v, uint64_t); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"interval\": %"FMTu64",\n", u64v); + + CTL_GET("prof.lg_sample", &ssv, ssize_t); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"lg_sample\": %zd\n", ssv); + + malloc_cprintf(write_cb, cbopaque, + "\t\t}%s\n", (config_stats || merged || unmerged) ? "," : + ""); + } +} + +static void +stats_print_helper(void (*write_cb)(void *, const char *), void *cbopaque, + bool json, bool merged, bool unmerged, bool bins, bool large, bool huge) +{ + size_t *cactive; + size_t allocated, active, metadata, resident, mapped, retained; + + CTL_GET("stats.cactive", &cactive, size_t *); + CTL_GET("stats.allocated", &allocated, size_t); + CTL_GET("stats.active", &active, size_t); + CTL_GET("stats.metadata", &metadata, size_t); + CTL_GET("stats.resident", &resident, size_t); + CTL_GET("stats.mapped", &mapped, size_t); + CTL_GET("stats.retained", &retained, size_t); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\"stats\": {\n"); + + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"cactive\": %zu,\n", atomic_read_z(cactive)); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"allocated\": %zu,\n", allocated); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"active\": %zu,\n", active); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"metadata\": %zu,\n", metadata); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"resident\": %zu,\n", resident); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"mapped\": %zu,\n", mapped); + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"retained\": %zu\n", retained); + + malloc_cprintf(write_cb, cbopaque, + "\t\t}%s\n", (merged || unmerged) ? 
"," : ""); + } else { + malloc_cprintf(write_cb, cbopaque, + "Allocated: %zu, active: %zu, metadata: %zu," + " resident: %zu, mapped: %zu, retained: %zu\n", + allocated, active, metadata, resident, mapped, retained); + malloc_cprintf(write_cb, cbopaque, + "Current active ceiling: %zu\n", + atomic_read_z(cactive)); + } + + if (merged || unmerged) { + unsigned narenas; + + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\"stats.arenas\": {\n"); + } + + CTL_GET("arenas.narenas", &narenas, unsigned); + { + VARIABLE_ARRAY(bool, initialized, narenas); + size_t isz; + unsigned i, j, ninitialized; + + isz = sizeof(bool) * narenas; + xmallctl("arenas.initialized", (void *)initialized, + &isz, NULL, 0); + for (i = ninitialized = 0; i < narenas; i++) { + if (initialized[i]) + ninitialized++; + } + + /* Merged stats. */ + if (merged && (ninitialized > 1 || !unmerged)) { + /* Print merged arena stats. */ + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t\"merged\": {\n"); + } else { + malloc_cprintf(write_cb, cbopaque, + "\nMerged arenas stats:\n"); + } + stats_arena_print(write_cb, cbopaque, json, + narenas, bins, large, huge); + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t\t}%s\n", (ninitialized > 1) ? + "," : ""); + } + } + + /* Unmerged stats. */ + for (i = j = 0; i < narenas; i++) { + if (initialized[i]) { + if (json) { + j++; + malloc_cprintf(write_cb, + cbopaque, + "\t\t\t\"%u\": {\n", i); + } else { + malloc_cprintf(write_cb, + cbopaque, "\narenas[%u]:\n", + i); + } + stats_arena_print(write_cb, cbopaque, + json, i, bins, large, huge); + if (json) { + malloc_cprintf(write_cb, + cbopaque, + "\t\t\t}%s\n", (j < + ninitialized) ? "," : ""); + } + } + } + } + + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t\t}\n"); + } + } } void @@ -375,6 +1067,7 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, int err; uint64_t epoch; size_t u64sz; + bool json = false; bool general = true; bool merged = true; bool unmerged = true; @@ -408,6 +1101,9 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, for (i = 0; opts[i] != '\0'; i++) { switch (opts[i]) { + case 'J': + json = true; + break; case 'g': general = false; break; @@ -431,246 +1127,27 @@ stats_print(void (*write_cb)(void *, const char *), void *cbopaque, } } - malloc_cprintf(write_cb, cbopaque, - "___ Begin jemalloc statistics ___\n"); - if (general) { - const char *cpv; - bool bv; - unsigned uv; - ssize_t ssv; - size_t sv, bsz, usz, ssz, sssz, cpsz; - - bsz = sizeof(bool); - usz = sizeof(unsigned); - ssz = sizeof(size_t); - sssz = sizeof(ssize_t); - cpsz = sizeof(const char *); - - CTL_GET("version", &cpv, const char *); - malloc_cprintf(write_cb, cbopaque, "Version: %s\n", cpv); - CTL_GET("config.debug", &bv, bool); - malloc_cprintf(write_cb, cbopaque, "Assertions %s\n", - bv ? "enabled" : "disabled"); + if (json) { malloc_cprintf(write_cb, cbopaque, - "config.malloc_conf: \"%s\"\n", config_malloc_conf); - -#define OPT_WRITE_BOOL(n) \ - if (je_mallctl("opt."#n, &bv, &bsz, NULL, 0) == 0) { \ - malloc_cprintf(write_cb, cbopaque, \ - " opt."#n": %s\n", bv ? "true" : "false"); \ - } -#define OPT_WRITE_BOOL_MUTABLE(n, m) { \ - bool bv2; \ - if (je_mallctl("opt."#n, &bv, &bsz, NULL, 0) == 0 && \ - je_mallctl(#m, &bv2, &bsz, NULL, 0) == 0) { \ - malloc_cprintf(write_cb, cbopaque, \ - " opt."#n": %s ("#m": %s)\n", bv ? "true" \ - : "false", bv2 ? 
"true" : "false"); \ - } \ -} -#define OPT_WRITE_UNSIGNED(n) \ - if (je_mallctl("opt."#n, &uv, &usz, NULL, 0) == 0) { \ - malloc_cprintf(write_cb, cbopaque, \ - " opt."#n": %u\n", uv); \ - } -#define OPT_WRITE_SIZE_T(n) \ - if (je_mallctl("opt."#n, &sv, &ssz, NULL, 0) == 0) { \ - malloc_cprintf(write_cb, cbopaque, \ - " opt."#n": %zu\n", sv); \ - } -#define OPT_WRITE_SSIZE_T(n) \ - if (je_mallctl("opt."#n, &ssv, &sssz, NULL, 0) == 0) { \ - malloc_cprintf(write_cb, cbopaque, \ - " opt."#n": %zd\n", ssv); \ - } -#define OPT_WRITE_SSIZE_T_MUTABLE(n, m) { \ - ssize_t ssv2; \ - if (je_mallctl("opt."#n, &ssv, &sssz, NULL, 0) == 0 && \ - je_mallctl(#m, &ssv2, &sssz, NULL, 0) == 0) { \ - malloc_cprintf(write_cb, cbopaque, \ - " opt."#n": %zd ("#m": %zd)\n", \ - ssv, ssv2); \ - } \ -} -#define OPT_WRITE_CHAR_P(n) \ - if (je_mallctl("opt."#n, &cpv, &cpsz, NULL, 0) == 0) { \ - malloc_cprintf(write_cb, cbopaque, \ - " opt."#n": \"%s\"\n", cpv); \ - } - + "{\n" + "\t\"jemalloc\": {\n"); + } else { malloc_cprintf(write_cb, cbopaque, - "Run-time option settings:\n"); - OPT_WRITE_BOOL(abort) - OPT_WRITE_SIZE_T(lg_chunk) - OPT_WRITE_CHAR_P(dss) - OPT_WRITE_UNSIGNED(narenas) - OPT_WRITE_CHAR_P(purge) - if (opt_purge == purge_mode_ratio) { - OPT_WRITE_SSIZE_T_MUTABLE(lg_dirty_mult, - arenas.lg_dirty_mult) - } - if (opt_purge == purge_mode_decay) - OPT_WRITE_SSIZE_T_MUTABLE(decay_time, arenas.decay_time) - OPT_WRITE_BOOL(stats_print) - OPT_WRITE_CHAR_P(junk) - OPT_WRITE_SIZE_T(quarantine) - OPT_WRITE_BOOL(redzone) - OPT_WRITE_BOOL(zero) - OPT_WRITE_BOOL(utrace) - OPT_WRITE_BOOL(valgrind) - OPT_WRITE_BOOL(xmalloc) - OPT_WRITE_BOOL(tcache) - OPT_WRITE_SSIZE_T(lg_tcache_max) - OPT_WRITE_BOOL(prof) - OPT_WRITE_CHAR_P(prof_prefix) - OPT_WRITE_BOOL_MUTABLE(prof_active, prof.active) - OPT_WRITE_BOOL_MUTABLE(prof_thread_active_init, - prof.thread_active_init) - OPT_WRITE_SSIZE_T(lg_prof_sample) - OPT_WRITE_BOOL(prof_accum) - OPT_WRITE_SSIZE_T(lg_prof_interval) - OPT_WRITE_BOOL(prof_gdump) - OPT_WRITE_BOOL(prof_final) - OPT_WRITE_BOOL(prof_leak) - -#undef OPT_WRITE_BOOL -#undef OPT_WRITE_BOOL_MUTABLE -#undef OPT_WRITE_SIZE_T -#undef OPT_WRITE_SSIZE_T -#undef OPT_WRITE_CHAR_P - - malloc_cprintf(write_cb, cbopaque, "CPUs: %u\n", ncpus); - - CTL_GET("arenas.narenas", &uv, unsigned); - malloc_cprintf(write_cb, cbopaque, "Arenas: %u\n", uv); - - malloc_cprintf(write_cb, cbopaque, "Pointer size: %zu\n", - sizeof(void *)); - - CTL_GET("arenas.quantum", &sv, size_t); - malloc_cprintf(write_cb, cbopaque, "Quantum size: %zu\n", - sv); - - CTL_GET("arenas.page", &sv, size_t); - malloc_cprintf(write_cb, cbopaque, "Page size: %zu\n", sv); - - CTL_GET("arenas.lg_dirty_mult", &ssv, ssize_t); - if (opt_purge == purge_mode_ratio) { - if (ssv >= 0) { - malloc_cprintf(write_cb, cbopaque, - "Min active:dirty page ratio per arena: " - "%u:1\n", (1U << ssv)); - } else { - malloc_cprintf(write_cb, cbopaque, - "Min active:dirty page ratio per arena: " - "N/A\n"); - } - } - CTL_GET("arenas.decay_time", &ssv, ssize_t); - if (opt_purge == purge_mode_decay) { - malloc_cprintf(write_cb, cbopaque, - "Unused dirty page decay time: %zd%s\n", - ssv, (ssv < 0) ? 
" (no decay)" : ""); - } - if (je_mallctl("arenas.tcache_max", &sv, &ssz, NULL, 0) == 0) { - malloc_cprintf(write_cb, cbopaque, - "Maximum thread-cached size class: %zu\n", sv); - } - if (je_mallctl("opt.prof", &bv, &bsz, NULL, 0) == 0 && bv) { - CTL_GET("prof.lg_sample", &sv, size_t); - malloc_cprintf(write_cb, cbopaque, - "Average profile sample interval: %"FMTu64 - " (2^%zu)\n", (((uint64_t)1U) << sv), sv); - - CTL_GET("opt.lg_prof_interval", &ssv, ssize_t); - if (ssv >= 0) { - malloc_cprintf(write_cb, cbopaque, - "Average profile dump interval: %"FMTu64 - " (2^%zd)\n", - (((uint64_t)1U) << ssv), ssv); - } else { - malloc_cprintf(write_cb, cbopaque, - "Average profile dump interval: N/A\n"); - } - } - CTL_GET("opt.lg_chunk", &sv, size_t); - malloc_cprintf(write_cb, cbopaque, - "Chunk size: %zu (2^%zu)\n", (ZU(1) << sv), sv); + "___ Begin jemalloc statistics ___\n"); } + if (general) + stats_general_print(write_cb, cbopaque, json, merged, unmerged); if (config_stats) { - size_t *cactive; - size_t allocated, active, metadata, resident, mapped, retained; - - CTL_GET("stats.cactive", &cactive, size_t *); - CTL_GET("stats.allocated", &allocated, size_t); - CTL_GET("stats.active", &active, size_t); - CTL_GET("stats.metadata", &metadata, size_t); - CTL_GET("stats.resident", &resident, size_t); - CTL_GET("stats.mapped", &mapped, size_t); - CTL_GET("stats.retained", &retained, size_t); - malloc_cprintf(write_cb, cbopaque, - "Allocated: %zu, active: %zu, metadata: %zu," - " resident: %zu, mapped: %zu, retained: %zu\n", - allocated, active, metadata, resident, mapped, retained); - malloc_cprintf(write_cb, cbopaque, - "Current active ceiling: %zu\n", - atomic_read_z(cactive)); - - if (merged) { - unsigned narenas; - - CTL_GET("arenas.narenas", &narenas, unsigned); - { - VARIABLE_ARRAY(bool, initialized, narenas); - size_t isz; - unsigned i, ninitialized; - - isz = sizeof(bool) * narenas; - xmallctl("arenas.initialized", initialized, - &isz, NULL, 0); - for (i = ninitialized = 0; i < narenas; i++) { - if (initialized[i]) - ninitialized++; - } - - if (ninitialized > 1 || !unmerged) { - /* Print merged arena stats. */ - malloc_cprintf(write_cb, cbopaque, - "\nMerged arenas stats:\n"); - stats_arena_print(write_cb, cbopaque, - narenas, bins, large, huge); - } - } - } - - if (unmerged) { - unsigned narenas; - - /* Print stats for each arena. 
*/ - - CTL_GET("arenas.narenas", &narenas, unsigned); - { - VARIABLE_ARRAY(bool, initialized, narenas); - size_t isz; - unsigned i; - - isz = sizeof(bool) * narenas; - xmallctl("arenas.initialized", initialized, - &isz, NULL, 0); - - for (i = 0; i < narenas; i++) { - if (initialized[i]) { - malloc_cprintf(write_cb, - cbopaque, - "\narenas[%u]:\n", i); - stats_arena_print(write_cb, - cbopaque, i, bins, large, - huge); - } - } - } - } + stats_print_helper(write_cb, cbopaque, json, merged, unmerged, + bins, large, huge); + } + if (json) { + malloc_cprintf(write_cb, cbopaque, + "\t}\n" + "}\n"); + } else { + malloc_cprintf(write_cb, cbopaque, + "--- End jemalloc statistics ---\n"); } - malloc_cprintf(write_cb, cbopaque, "--- End jemalloc statistics ---\n"); } diff --git a/src/tcache.c b/src/tcache.c index 175759c7..f97aa420 100644 --- a/src/tcache.c +++ b/src/tcache.c @@ -445,14 +445,14 @@ tcache_stats_merge(tsdn_t *tsdn, tcache_t *tcache, arena_t *arena) } bool -tcaches_create(tsdn_t *tsdn, unsigned *r_ind) +tcaches_create(tsd_t *tsd, unsigned *r_ind) { arena_t *arena; tcache_t *tcache; tcaches_t *elm; if (tcaches == NULL) { - tcaches = base_alloc(tsdn, sizeof(tcache_t *) * + tcaches = base_alloc(tsd_tsdn(tsd), sizeof(tcache_t *) * (MALLOCX_TCACHE_MAX+1)); if (tcaches == NULL) return (true); @@ -460,10 +460,10 @@ tcaches_create(tsdn_t *tsdn, unsigned *r_ind) if (tcaches_avail == NULL && tcaches_past > MALLOCX_TCACHE_MAX) return (true); - arena = arena_ichoose(tsdn, NULL); + arena = arena_ichoose(tsd, NULL); if (unlikely(arena == NULL)) return (true); - tcache = tcache_create(tsdn, arena); + tcache = tcache_create(tsd_tsdn(tsd), arena); if (tcache == NULL) return (true); diff --git a/src/tsd.c b/src/tsd.c index aeaa5e18..ec69a51c 100644 --- a/src/tsd.c +++ b/src/tsd.c @@ -171,10 +171,10 @@ tsd_init_check_recursion(tsd_init_head_t *head, tsd_init_block_t *block) tsd_init_block_t *iter; /* Check whether this thread has already inserted into the list. */ - malloc_mutex_lock(NULL, &head->lock); + malloc_mutex_lock(TSDN_NULL, &head->lock); ql_foreach(iter, &head->blocks, link) { if (iter->thread == self) { - malloc_mutex_unlock(NULL, &head->lock); + malloc_mutex_unlock(TSDN_NULL, &head->lock); return (iter->data); } } @@ -182,7 +182,7 @@ tsd_init_check_recursion(tsd_init_head_t *head, tsd_init_block_t *block) ql_elm_new(block, link); block->thread = self; ql_tail_insert(&head->blocks, block, link); - malloc_mutex_unlock(NULL, &head->lock); + malloc_mutex_unlock(TSDN_NULL, &head->lock); return (NULL); } @@ -190,8 +190,8 @@ void tsd_init_finish(tsd_init_head_t *head, tsd_init_block_t *block) { - malloc_mutex_lock(NULL, &head->lock); + malloc_mutex_lock(TSDN_NULL, &head->lock); ql_remove(&head->blocks, block, link); - malloc_mutex_unlock(NULL, &head->lock); + malloc_mutex_unlock(TSDN_NULL, &head->lock); } #endif diff --git a/src/util.c b/src/util.c index a1c4a2a4..79052674 100644 --- a/src/util.c +++ b/src/util.c @@ -49,7 +49,7 @@ static void wrtmessage(void *cbopaque, const char *s) { -#ifdef SYS_write +#if defined(JEMALLOC_HAVE_SYSCALL) && defined(SYS_write) /* * Use syscall(2) rather than write(2) when possible in order to avoid * the possibility of memory allocation within libc. This is necessary diff --git a/src/zone.c b/src/zone.c index 2c17123a..0571920e 100644 --- a/src/zone.c +++ b/src/zone.c @@ -4,7 +4,7 @@ #endif /* - * The malloc_default_purgeable_zone function is only available on >= 10.6. + * The malloc_default_purgeable_zone() function is only available on >= 10.6. 
* We need to check whether it is present at runtime, thus the weak_import. */ extern malloc_zone_t *malloc_default_purgeable_zone(void) @@ -13,8 +13,9 @@ JEMALLOC_ATTR(weak_import); /******************************************************************************/ /* Data. */ -static malloc_zone_t zone; -static struct malloc_introspection_t zone_introspect; +static malloc_zone_t *default_zone, *purgeable_zone; +static malloc_zone_t jemalloc_zone; +static struct malloc_introspection_t jemalloc_zone_introspect; /******************************************************************************/ /* Function prototypes for non-inline static functions. */ @@ -164,89 +165,103 @@ static void zone_force_unlock(malloc_zone_t *zone) { + /* + * Call jemalloc_postfork_child() rather than + * jemalloc_postfork_parent(), because this function is executed by both + * parent and child. The parent can tolerate having state + * reinitialized, but the child cannot unlock mutexes that were locked + * by the parent. + */ if (isthreaded) - jemalloc_postfork_parent(); + jemalloc_postfork_child(); } -JEMALLOC_ATTR(constructor) -void -register_zone(void) +static void +zone_init(void) { - /* - * If something else replaced the system default zone allocator, don't - * register jemalloc's. - */ - malloc_zone_t *default_zone = malloc_default_zone(); - malloc_zone_t *purgeable_zone = NULL; - if (!default_zone->zone_name || - strcmp(default_zone->zone_name, "DefaultMallocZone") != 0) { - return; - } - - zone.size = (void *)zone_size; - zone.malloc = (void *)zone_malloc; - zone.calloc = (void *)zone_calloc; - zone.valloc = (void *)zone_valloc; - zone.free = (void *)zone_free; - zone.realloc = (void *)zone_realloc; - zone.destroy = (void *)zone_destroy; - zone.zone_name = "jemalloc_zone"; - zone.batch_malloc = NULL; - zone.batch_free = NULL; - zone.introspect = &zone_introspect; - zone.version = JEMALLOC_ZONE_VERSION; + jemalloc_zone.size = (void *)zone_size; + jemalloc_zone.malloc = (void *)zone_malloc; + jemalloc_zone.calloc = (void *)zone_calloc; + jemalloc_zone.valloc = (void *)zone_valloc; + jemalloc_zone.free = (void *)zone_free; + jemalloc_zone.realloc = (void *)zone_realloc; + jemalloc_zone.destroy = (void *)zone_destroy; + jemalloc_zone.zone_name = "jemalloc_zone"; + jemalloc_zone.batch_malloc = NULL; + jemalloc_zone.batch_free = NULL; + jemalloc_zone.introspect = &jemalloc_zone_introspect; + jemalloc_zone.version = JEMALLOC_ZONE_VERSION; #if (JEMALLOC_ZONE_VERSION >= 5) - zone.memalign = zone_memalign; + jemalloc_zone.memalign = zone_memalign; #endif #if (JEMALLOC_ZONE_VERSION >= 6) - zone.free_definite_size = zone_free_definite_size; + jemalloc_zone.free_definite_size = zone_free_definite_size; #endif #if (JEMALLOC_ZONE_VERSION >= 8) - zone.pressure_relief = NULL; + jemalloc_zone.pressure_relief = NULL; #endif - zone_introspect.enumerator = NULL; - zone_introspect.good_size = (void *)zone_good_size; - zone_introspect.check = NULL; - zone_introspect.print = NULL; - zone_introspect.log = NULL; - zone_introspect.force_lock = (void *)zone_force_lock; - zone_introspect.force_unlock = (void *)zone_force_unlock; - zone_introspect.statistics = NULL; + jemalloc_zone_introspect.enumerator = NULL; + jemalloc_zone_introspect.good_size = (void *)zone_good_size; + jemalloc_zone_introspect.check = NULL; + jemalloc_zone_introspect.print = NULL; + jemalloc_zone_introspect.log = NULL; + jemalloc_zone_introspect.force_lock = (void *)zone_force_lock; + jemalloc_zone_introspect.force_unlock = (void *)zone_force_unlock; + 
jemalloc_zone_introspect.statistics = NULL; #if (JEMALLOC_ZONE_VERSION >= 6) - zone_introspect.zone_locked = NULL; + jemalloc_zone_introspect.zone_locked = NULL; #endif #if (JEMALLOC_ZONE_VERSION >= 7) - zone_introspect.enable_discharge_checking = NULL; - zone_introspect.disable_discharge_checking = NULL; - zone_introspect.discharge = NULL; -#ifdef __BLOCKS__ - zone_introspect.enumerate_discharged_pointers = NULL; -#else - zone_introspect.enumerate_unavailable_without_blocks = NULL; -#endif + jemalloc_zone_introspect.enable_discharge_checking = NULL; + jemalloc_zone_introspect.disable_discharge_checking = NULL; + jemalloc_zone_introspect.discharge = NULL; +# ifdef __BLOCKS__ + jemalloc_zone_introspect.enumerate_discharged_pointers = NULL; +# else + jemalloc_zone_introspect.enumerate_unavailable_without_blocks = NULL; +# endif #endif +} + +static malloc_zone_t * +zone_default_get(void) +{ + malloc_zone_t **zones = NULL; + unsigned int num_zones = 0; /* - * The default purgeable zone is created lazily by OSX's libc. It uses - * the default zone when it is created for "small" allocations - * (< 15 KiB), but assumes the default zone is a scalable_zone. This - * obviously fails when the default zone is the jemalloc zone, so - * malloc_default_purgeable_zone is called beforehand so that the - * default purgeable zone is created when the default zone is still - * a scalable_zone. As purgeable zones only exist on >= 10.6, we need - * to check for the existence of malloc_default_purgeable_zone() at - * run time. + * On OSX 10.12, malloc_default_zone returns a special zone that is not + * present in the list of registered zones. That zone uses a "lite zone" + * if one is present (apparently enabled when malloc stack logging is + * enabled), or the first registered zone otherwise. In practice this + * means unless malloc stack logging is enabled, the first registered + * zone is the default. So get the list of zones to get the first one, + * instead of relying on malloc_default_zone. */ - if (malloc_default_purgeable_zone != NULL) - purgeable_zone = malloc_default_purgeable_zone(); + if (KERN_SUCCESS != malloc_get_all_zones(0, NULL, + (vm_address_t**)&zones, &num_zones)) { + /* + * Reset the value in case the failure happened after it was + * set. + */ + num_zones = 0; + } - /* Register the custom zone. At this point it won't be the default. */ - malloc_zone_register(&zone); + if (num_zones) + return (zones[0]); + + return (malloc_default_zone()); +} + +/* As written, this function can only promote jemalloc_zone. */ +static void +zone_promote(void) +{ + malloc_zone_t *zone; do { - default_zone = malloc_default_zone(); /* * Unregister and reregister the default zone. On OSX >= 10.6, * unregistering takes the last registered zone and places it @@ -257,6 +272,7 @@ register_zone(void) */ malloc_zone_unregister(default_zone); malloc_zone_register(default_zone); + /* * On OSX 10.6, having the default purgeable zone appear before * the default zone makes some things crash because it thinks it @@ -268,9 +284,47 @@ register_zone(void) * above, i.e. the default zone. Registering it again then puts * it at the end, obviously after the default zone. 
*/ - if (purgeable_zone) { + if (purgeable_zone != NULL) { malloc_zone_unregister(purgeable_zone); malloc_zone_register(purgeable_zone); } - } while (malloc_default_zone() != &zone); + + zone = zone_default_get(); + } while (zone != &jemalloc_zone); +} + +JEMALLOC_ATTR(constructor) +void +zone_register(void) +{ + + /* + * If something else replaced the system default zone allocator, don't + * register jemalloc's. + */ + default_zone = zone_default_get(); + if (!default_zone->zone_name || strcmp(default_zone->zone_name, + "DefaultMallocZone") != 0) + return; + + /* + * The default purgeable zone is created lazily by OSX's libc. It uses + * the default zone when it is created for "small" allocations + * (< 15 KiB), but assumes the default zone is a scalable_zone. This + * obviously fails when the default zone is the jemalloc zone, so + * malloc_default_purgeable_zone() is called beforehand so that the + * default purgeable zone is created when the default zone is still + * a scalable_zone. As purgeable zones only exist on >= 10.6, we need + * to check for the existence of malloc_default_purgeable_zone() at + * run time. + */ + purgeable_zone = (malloc_default_purgeable_zone == NULL) ? NULL : + malloc_default_purgeable_zone(); + + /* Register the custom zone. At this point it won't be the default. */ + zone_init(); + malloc_zone_register(&jemalloc_zone); + + /* Promote the custom zone to be default. */ + zone_promote(); } diff --git a/test/include/test/mtx.h b/test/include/test/mtx.h index bbe822f5..58afbc3d 100644 --- a/test/include/test/mtx.h +++ b/test/include/test/mtx.h @@ -8,6 +8,8 @@ typedef struct { #ifdef _WIN32 CRITICAL_SECTION lock; +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) + os_unfair_lock lock; #elif (defined(JEMALLOC_OSSPIN)) OSSpinLock lock; #else diff --git a/test/integration/aligned_alloc.c b/test/integration/aligned_alloc.c index 60900148..58438421 100644 --- a/test/integration/aligned_alloc.c +++ b/test/integration/aligned_alloc.c @@ -1,9 +1,20 @@ #include "test/jemalloc_test.h" #define CHUNK 0x400000 -/* #define MAXALIGN ((size_t)UINT64_C(0x80000000000)) */ -#define MAXALIGN ((size_t)0x2000000LU) -#define NITER 4 +#define MAXALIGN (((size_t)1) << 23) + +/* + * On systems which can't merge extents, tests that call this function generate + * a lot of dirty memory very quickly. Purging between cycles mitigates + * potential OOM on e.g. 32-bit Windows. + */ +static void +purge(void) +{ + + assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0, + "Unexpected mallctl error"); +} TEST_BEGIN(test_alignment_errors) { @@ -74,6 +85,7 @@ TEST_END TEST_BEGIN(test_alignment_and_size) { +#define NITER 4 size_t alignment, size, total; unsigned i; void *ps[NITER]; @@ -110,7 +122,9 @@ TEST_BEGIN(test_alignment_and_size) } } } + purge(); } +#undef NITER } TEST_END diff --git a/test/integration/mallocx.c b/test/integration/mallocx.c index 55e1a090..43b76eba 100644 --- a/test/integration/mallocx.c +++ b/test/integration/mallocx.c @@ -50,6 +50,19 @@ get_huge_size(size_t ind) return (get_size_impl("arenas.hchunk.0.size", ind)); } +/* + * On systems which can't merge extents, tests that call this function generate + * a lot of dirty memory very quickly. Purging between cycles mitigates + * potential OOM on e.g. 32-bit Windows. 
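The purge() helper defined for these tests (in aligned_alloc.c above, and again in mallocx.c and posix_memalign.c below) purges only arena 0, which is all these tests allocate from. Application code can apply the same mallctl namespace to every arena; a hedged sketch using only the documented "arenas.narenas" and "arena.<i>.purge" names, with error handling kept to a minimum:

#include <stdio.h>
#include <jemalloc/jemalloc.h>

/* Ask each arena to purge its dirty pages; best effort. */
static void
purge_all_arenas(void)
{
	unsigned narenas, i;
	size_t sz = sizeof(narenas);
	char cmd[64];

	if (mallctl("arenas.narenas", &narenas, &sz, NULL, 0) != 0)
		return;
	for (i = 0; i < narenas; i++) {
		snprintf(cmd, sizeof(cmd), "arena.%u.purge", i);
		mallctl(cmd, NULL, NULL, NULL, 0);
	}
}
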
+ */ +static void +purge(void) +{ + + assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0, + "Unexpected mallctl error"); +} + TEST_BEGIN(test_overflow) { size_t hugemax; @@ -96,6 +109,7 @@ TEST_BEGIN(test_oom) if (ptrs[i] != NULL) dallocx(ptrs[i], 0); } + purge(); #if LG_SIZEOF_PTR == 3 assert_ptr_null(mallocx(0x8000000000000000ULL, @@ -113,7 +127,7 @@ TEST_END TEST_BEGIN(test_basic) { -#define MAXSZ (((size_t)1) << 26) +#define MAXSZ (((size_t)1) << 23) size_t sz; for (sz = 1; sz < MAXSZ; sz = nallocx(sz, 0) + 1) { @@ -122,23 +136,28 @@ TEST_BEGIN(test_basic) nsz = nallocx(sz, 0); assert_zu_ne(nsz, 0, "Unexpected nallocx() error"); p = mallocx(sz, 0); - assert_ptr_not_null(p, "Unexpected mallocx() error"); + assert_ptr_not_null(p, + "Unexpected mallocx(size=%zx, flags=0) error", sz); rsz = sallocx(p, 0); assert_zu_ge(rsz, sz, "Real size smaller than expected"); assert_zu_eq(nsz, rsz, "nallocx()/sallocx() size mismatch"); dallocx(p, 0); p = mallocx(sz, 0); - assert_ptr_not_null(p, "Unexpected mallocx() error"); + assert_ptr_not_null(p, + "Unexpected mallocx(size=%zx, flags=0) error", sz); dallocx(p, 0); nsz = nallocx(sz, MALLOCX_ZERO); assert_zu_ne(nsz, 0, "Unexpected nallocx() error"); p = mallocx(sz, MALLOCX_ZERO); - assert_ptr_not_null(p, "Unexpected mallocx() error"); + assert_ptr_not_null(p, + "Unexpected mallocx(size=%zx, flags=MALLOCX_ZERO) error", + nsz); rsz = sallocx(p, 0); assert_zu_eq(nsz, rsz, "nallocx()/sallocx() rsize mismatch"); dallocx(p, 0); + purge(); } #undef MAXSZ } @@ -146,7 +165,7 @@ TEST_END TEST_BEGIN(test_alignment_and_size) { -#define MAXALIGN (((size_t)1) << 25) +#define MAXALIGN (((size_t)1) << 23) #define NITER 4 size_t nsz, rsz, sz, alignment, total; unsigned i; @@ -196,6 +215,7 @@ TEST_BEGIN(test_alignment_and_size) } } } + purge(); } #undef MAXALIGN #undef NITER diff --git a/test/integration/posix_memalign.c b/test/integration/posix_memalign.c index 19741c6c..e22e1020 100644 --- a/test/integration/posix_memalign.c +++ b/test/integration/posix_memalign.c @@ -1,9 +1,20 @@ #include "test/jemalloc_test.h" #define CHUNK 0x400000 -/* #define MAXALIGN ((size_t)UINT64_C(0x80000000000)) */ -#define MAXALIGN ((size_t)0x2000000LU) -#define NITER 4 +#define MAXALIGN (((size_t)1) << 23) + +/* + * On systems which can't merge extents, tests that call this function generate + * a lot of dirty memory very quickly. Purging between cycles mitigates + * potential OOM on e.g. 32-bit Windows. 
+ */ +static void +purge(void) +{ + + assert_d_eq(mallctl("arena.0.purge", NULL, NULL, NULL, 0), 0, + "Unexpected mallctl error"); +} TEST_BEGIN(test_alignment_errors) { @@ -66,6 +77,7 @@ TEST_END TEST_BEGIN(test_alignment_and_size) { +#define NITER 4 size_t alignment, size, total; unsigned i; int err; @@ -104,7 +116,9 @@ TEST_BEGIN(test_alignment_and_size) } } } + purge(); } +#undef NITER } TEST_END diff --git a/test/src/mtx.c b/test/src/mtx.c index 73bd02f6..8a5dfdd9 100644 --- a/test/src/mtx.c +++ b/test/src/mtx.c @@ -11,6 +11,8 @@ mtx_init(mtx_t *mtx) #ifdef _WIN32 if (!InitializeCriticalSectionAndSpinCount(&mtx->lock, _CRT_SPINCOUNT)) return (true); +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) + mtx->lock = OS_UNFAIR_LOCK_INIT; #elif (defined(JEMALLOC_OSSPIN)) mtx->lock = 0; #else @@ -33,6 +35,7 @@ mtx_fini(mtx_t *mtx) { #ifdef _WIN32 +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) #elif (defined(JEMALLOC_OSSPIN)) #else pthread_mutex_destroy(&mtx->lock); @@ -45,6 +48,8 @@ mtx_lock(mtx_t *mtx) #ifdef _WIN32 EnterCriticalSection(&mtx->lock); +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) + os_unfair_lock_lock(&mtx->lock); #elif (defined(JEMALLOC_OSSPIN)) OSSpinLockLock(&mtx->lock); #else @@ -58,6 +63,8 @@ mtx_unlock(mtx_t *mtx) #ifdef _WIN32 LeaveCriticalSection(&mtx->lock); +#elif (defined(JEMALLOC_OS_UNFAIR_LOCK)) + os_unfair_lock_unlock(&mtx->lock); #elif (defined(JEMALLOC_OSSPIN)) OSSpinLockUnlock(&mtx->lock); #else diff --git a/test/unit/ckh.c b/test/unit/ckh.c index 961e2acb..2cbc2268 100644 --- a/test/unit/ckh.c +++ b/test/unit/ckh.c @@ -2,24 +2,24 @@ TEST_BEGIN(test_new_delete) { - tsdn_t *tsdn; + tsd_t *tsd; ckh_t ckh; - tsdn = tsdn_fetch(); + tsd = tsd_fetch(); - assert_false(ckh_new(tsdn, &ckh, 2, ckh_string_hash, + assert_false(ckh_new(tsd, &ckh, 2, ckh_string_hash, ckh_string_keycomp), "Unexpected ckh_new() error"); - ckh_delete(tsdn, &ckh); + ckh_delete(tsd, &ckh); - assert_false(ckh_new(tsdn, &ckh, 3, ckh_pointer_hash, + assert_false(ckh_new(tsd, &ckh, 3, ckh_pointer_hash, ckh_pointer_keycomp), "Unexpected ckh_new() error"); - ckh_delete(tsdn, &ckh); + ckh_delete(tsd, &ckh); } TEST_END TEST_BEGIN(test_count_insert_search_remove) { - tsdn_t *tsdn; + tsd_t *tsd; ckh_t ckh; const char *strs[] = { "a string", @@ -30,9 +30,9 @@ TEST_BEGIN(test_count_insert_search_remove) const char *missing = "A string not in the hash table."; size_t i; - tsdn = tsdn_fetch(); + tsd = tsd_fetch(); - assert_false(ckh_new(tsdn, &ckh, 2, ckh_string_hash, + assert_false(ckh_new(tsd, &ckh, 2, ckh_string_hash, ckh_string_keycomp), "Unexpected ckh_new() error"); assert_zu_eq(ckh_count(&ckh), 0, "ckh_count() should return %zu, but it returned %zu", ZU(0), @@ -40,7 +40,7 @@ TEST_BEGIN(test_count_insert_search_remove) /* Insert. */ for (i = 0; i < sizeof(strs)/sizeof(const char *); i++) { - ckh_insert(tsdn, &ckh, strs[i], strs[i]); + ckh_insert(tsd, &ckh, strs[i], strs[i]); assert_zu_eq(ckh_count(&ckh), i+1, "ckh_count() should return %zu, but it returned %zu", i+1, ckh_count(&ckh)); @@ -85,7 +85,7 @@ TEST_BEGIN(test_count_insert_search_remove) vp = (i & 2) ? &v.p : NULL; k.p = NULL; v.p = NULL; - assert_false(ckh_remove(tsdn, &ckh, strs[i], kp, vp), + assert_false(ckh_remove(tsd, &ckh, strs[i], kp, vp), "Unexpected ckh_remove() error"); ks = (i & 1) ? 
strs[i] : (const char *)NULL; @@ -101,22 +101,22 @@ TEST_BEGIN(test_count_insert_search_remove) ckh_count(&ckh)); } - ckh_delete(tsdn, &ckh); + ckh_delete(tsd, &ckh); } TEST_END TEST_BEGIN(test_insert_iter_remove) { #define NITEMS ZU(1000) - tsdn_t *tsdn; + tsd_t *tsd; ckh_t ckh; void **p[NITEMS]; void *q, *r; size_t i; - tsdn = tsdn_fetch(); + tsd = tsd_fetch(); - assert_false(ckh_new(tsdn, &ckh, 2, ckh_pointer_hash, + assert_false(ckh_new(tsd, &ckh, 2, ckh_pointer_hash, ckh_pointer_keycomp), "Unexpected ckh_new() error"); for (i = 0; i < NITEMS; i++) { @@ -128,7 +128,7 @@ TEST_BEGIN(test_insert_iter_remove) size_t j; for (j = i; j < NITEMS; j++) { - assert_false(ckh_insert(tsdn, &ckh, p[j], p[j]), + assert_false(ckh_insert(tsd, &ckh, p[j], p[j]), "Unexpected ckh_insert() failure"); assert_false(ckh_search(&ckh, p[j], &q, &r), "Unexpected ckh_search() failure"); @@ -143,13 +143,13 @@ TEST_BEGIN(test_insert_iter_remove) for (j = i + 1; j < NITEMS; j++) { assert_false(ckh_search(&ckh, p[j], NULL, NULL), "Unexpected ckh_search() failure"); - assert_false(ckh_remove(tsdn, &ckh, p[j], &q, &r), + assert_false(ckh_remove(tsd, &ckh, p[j], &q, &r), "Unexpected ckh_remove() failure"); assert_ptr_eq(p[j], q, "Key pointer mismatch"); assert_ptr_eq(p[j], r, "Value pointer mismatch"); assert_true(ckh_search(&ckh, p[j], NULL, NULL), "Unexpected ckh_search() success"); - assert_true(ckh_remove(tsdn, &ckh, p[j], &q, &r), + assert_true(ckh_remove(tsd, &ckh, p[j], &q, &r), "Unexpected ckh_remove() success"); } @@ -184,13 +184,13 @@ TEST_BEGIN(test_insert_iter_remove) for (i = 0; i < NITEMS; i++) { assert_false(ckh_search(&ckh, p[i], NULL, NULL), "Unexpected ckh_search() failure"); - assert_false(ckh_remove(tsdn, &ckh, p[i], &q, &r), + assert_false(ckh_remove(tsd, &ckh, p[i], &q, &r), "Unexpected ckh_remove() failure"); assert_ptr_eq(p[i], q, "Key pointer mismatch"); assert_ptr_eq(p[i], r, "Value pointer mismatch"); assert_true(ckh_search(&ckh, p[i], NULL, NULL), "Unexpected ckh_search() success"); - assert_true(ckh_remove(tsdn, &ckh, p[i], &q, &r), + assert_true(ckh_remove(tsd, &ckh, p[i], &q, &r), "Unexpected ckh_remove() success"); dallocx(p[i], 0); } @@ -198,7 +198,7 @@ TEST_BEGIN(test_insert_iter_remove) assert_zu_eq(ckh_count(&ckh), 0, "ckh_count() should return %zu, but it returned %zu", ZU(0), ckh_count(&ckh)); - ckh_delete(tsdn, &ckh); + ckh_delete(tsd, &ckh); #undef NITEMS } TEST_END diff --git a/test/unit/decay.c b/test/unit/decay.c index 70a2e67a..e169ae24 100644 --- a/test/unit/decay.c +++ b/test/unit/decay.c @@ -2,20 +2,28 @@ const char *malloc_conf = "purge:decay,decay_time:1"; +static nstime_monotonic_t *nstime_monotonic_orig; static nstime_update_t *nstime_update_orig; static unsigned nupdates_mock; static nstime_t time_mock; -static bool nonmonotonic_mock; +static bool monotonic_mock; + +static bool +nstime_monotonic_mock(void) +{ + + return (monotonic_mock); +} static bool nstime_update_mock(nstime_t *time) { nupdates_mock++; - if (!nonmonotonic_mock) + if (monotonic_mock) nstime_copy(time, &time_mock); - return (nonmonotonic_mock); + return (!monotonic_mock); } TEST_BEGIN(test_decay_ticks) @@ -245,9 +253,11 @@ TEST_BEGIN(test_decay_ticker) nupdates_mock = 0; nstime_init(&time_mock, 0); nstime_update(&time_mock); - nonmonotonic_mock = false; + monotonic_mock = true; + nstime_monotonic_orig = nstime_monotonic; nstime_update_orig = nstime_update; + nstime_monotonic = nstime_monotonic_mock; nstime_update = nstime_update_mock; for (i = 0; i < NPS; i++) { @@ -259,6 +269,7 @@ 
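The decay tests above can swap in a fake clock because nstime_monotonic and nstime_update are exposed to the tests as mutable function pointers (via the JET test machinery): the test saves the originals, installs mocks, and restores them afterwards. A generic sketch of that interception pattern, where time_source, real_now, and fake_now are illustrative names rather than jemalloc symbols:

#include <stdint.h>

/* Library side: an indirection point that tests may override. */
typedef uint64_t (now_t)(void);

static uint64_t
real_now(void)
{

	return (42);	/* Real code would read a clock here. */
}
now_t *time_source = real_now;

/* Test side: a deterministic stand-in. */
static uint64_t fake_ns;

static uint64_t
fake_now(void)
{

	return (fake_ns);
}

static void
test_with_mock_clock(void)
{
	now_t *orig = time_source;

	time_source = fake_now;
	fake_ns = 1000;
	/* ... exercise code that calls time_source() ... */
	time_source = orig;
}
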
TEST_BEGIN(test_decay_ticker) "Expected nstime_update() to be called"); } + nstime_monotonic = nstime_monotonic_orig; nstime_update = nstime_update_orig; nstime_init(&time, 0); @@ -316,9 +327,11 @@ TEST_BEGIN(test_decay_nonmonotonic) nupdates_mock = 0; nstime_init(&time_mock, 0); nstime_update(&time_mock); - nonmonotonic_mock = true; + monotonic_mock = false; + nstime_monotonic_orig = nstime_monotonic; nstime_update_orig = nstime_update; + nstime_monotonic = nstime_monotonic_mock; nstime_update = nstime_update_mock; for (i = 0; i < NPS; i++) { @@ -342,8 +355,9 @@ TEST_BEGIN(test_decay_nonmonotonic) config_stats ? 0 : ENOENT, "Unexpected mallctl result"); if (config_stats) - assert_u64_gt(npurge1, npurge0, "Expected purging to occur"); + assert_u64_eq(npurge0, npurge1, "Unexpected purging occurred"); + nstime_monotonic = nstime_monotonic_orig; nstime_update = nstime_update_orig; #undef NPS } diff --git a/test/unit/fork.c b/test/unit/fork.c index 46c815ef..c530797c 100644 --- a/test/unit/fork.c +++ b/test/unit/fork.c @@ -26,7 +26,7 @@ TEST_BEGIN(test_fork) test_fail("Unexpected fork() failure"); } else if (pid == 0) { /* Child. */ - exit(0); + _exit(0); } else { int status; diff --git a/test/unit/junk.c b/test/unit/junk.c index acddc601..460bd524 100644 --- a/test/unit/junk.c +++ b/test/unit/junk.c @@ -53,10 +53,10 @@ arena_dalloc_junk_large_intercept(void *ptr, size_t usize) } static void -huge_dalloc_junk_intercept(tsdn_t *tsdn, void *ptr, size_t usize) +huge_dalloc_junk_intercept(void *ptr, size_t usize) { - huge_dalloc_junk_orig(tsdn, ptr, usize); + huge_dalloc_junk_orig(ptr, usize); /* * The conditions under which junk filling actually occurs are nuanced * enough that it doesn't make sense to duplicate the decision logic in diff --git a/test/unit/math.c b/test/unit/math.c index ebec77a6..adb72bed 100644 --- a/test/unit/math.c +++ b/test/unit/math.c @@ -5,6 +5,10 @@ #include +#ifdef __PGI +#undef INFINITY +#endif + #ifndef INFINITY #define INFINITY (DBL_MAX + DBL_MAX) #endif diff --git a/test/unit/nstime.c b/test/unit/nstime.c index cd7d9a6d..0368bc26 100644 --- a/test/unit/nstime.c +++ b/test/unit/nstime.c @@ -176,6 +176,13 @@ TEST_BEGIN(test_nstime_divide) } TEST_END +TEST_BEGIN(test_nstime_monotonic) +{ + + nstime_monotonic(); +} +TEST_END + TEST_BEGIN(test_nstime_update) { nstime_t nst; @@ -198,7 +205,6 @@ TEST_BEGIN(test_nstime_update) assert_d_eq(nstime_compare(&nst, &nst0), 0, "Time should not have been modified"); } - } TEST_END @@ -216,5 +222,6 @@ main(void) test_nstime_imultiply, test_nstime_idivide, test_nstime_divide, + test_nstime_monotonic, test_nstime_update)); } diff --git a/test/unit/run_quantize.c b/test/unit/run_quantize.c index f6a2f74f..45f32018 100644 --- a/test/unit/run_quantize.c +++ b/test/unit/run_quantize.c @@ -111,7 +111,7 @@ TEST_BEGIN(test_monotonic) floor_prev = 0; ceil_prev = 0; - for (i = 1; i < run_quantize_max >> LG_PAGE; i++) { + for (i = 1; i <= large_maxclass >> LG_PAGE; i++) { size_t run_size, floor, ceil; run_size = i << LG_PAGE; diff --git a/test/unit/size_classes.c b/test/unit/size_classes.c index 2e2caaf5..4e1e0ce4 100644 --- a/test/unit/size_classes.c +++ b/test/unit/size_classes.c @@ -80,25 +80,96 @@ TEST_BEGIN(test_size_classes) } TEST_END +TEST_BEGIN(test_psize_classes) +{ + size_t size_class, max_size_class; + pszind_t pind, max_pind; + + max_size_class = get_max_size_class(); + max_pind = psz2ind(max_size_class); + + for (pind = 0, size_class = pind2sz(pind); pind < max_pind || + size_class < max_size_class; pind++, size_class = + 
pind2sz(pind)) { + assert_true(pind < max_pind, + "Loop conditionals should be equivalent; pind=%u, " + "size_class=%zu (%#zx)", pind, size_class, size_class); + assert_true(size_class < max_size_class, + "Loop conditionals should be equivalent; pind=%u, " + "size_class=%zu (%#zx)", pind, size_class, size_class); + + assert_u_eq(pind, psz2ind(size_class), + "psz2ind() does not reverse pind2sz(): pind=%u -->" + " size_class=%zu --> pind=%u --> size_class=%zu", pind, + size_class, psz2ind(size_class), + pind2sz(psz2ind(size_class))); + assert_zu_eq(size_class, pind2sz(psz2ind(size_class)), + "pind2sz() does not reverse psz2ind(): pind=%u -->" + " size_class=%zu --> pind=%u --> size_class=%zu", pind, + size_class, psz2ind(size_class), + pind2sz(psz2ind(size_class))); + + assert_u_eq(pind+1, psz2ind(size_class+1), + "Next size_class does not round up properly"); + + assert_zu_eq(size_class, (pind > 0) ? + psz2u(pind2sz(pind-1)+1) : psz2u(1), + "psz2u() does not round up to size class"); + assert_zu_eq(size_class, psz2u(size_class-1), + "psz2u() does not round up to size class"); + assert_zu_eq(size_class, psz2u(size_class), + "psz2u() does not compute same size class"); + assert_zu_eq(psz2u(size_class+1), pind2sz(pind+1), + "psz2u() does not round up to next size class"); + } + + assert_u_eq(pind, psz2ind(pind2sz(pind)), + "psz2ind() does not reverse pind2sz()"); + assert_zu_eq(max_size_class, pind2sz(psz2ind(max_size_class)), + "pind2sz() does not reverse psz2ind()"); + + assert_zu_eq(size_class, psz2u(pind2sz(pind-1)+1), + "psz2u() does not round up to size class"); + assert_zu_eq(size_class, psz2u(size_class-1), + "psz2u() does not round up to size class"); + assert_zu_eq(size_class, psz2u(size_class), + "psz2u() does not compute same size class"); +} +TEST_END + TEST_BEGIN(test_overflow) { size_t max_size_class; max_size_class = get_max_size_class(); - assert_u_ge(size2index(max_size_class+1), NSIZES, - "size2index() should return >= NSIZES on overflow"); - assert_u_ge(size2index(ZU(PTRDIFF_MAX)+1), NSIZES, - "size2index() should return >= NSIZES on overflow"); - assert_u_ge(size2index(SIZE_T_MAX), NSIZES, - "size2index() should return >= NSIZES on overflow"); + assert_u_eq(size2index(max_size_class+1), NSIZES, + "size2index() should return NSIZES on overflow"); + assert_u_eq(size2index(ZU(PTRDIFF_MAX)+1), NSIZES, + "size2index() should return NSIZES on overflow"); + assert_u_eq(size2index(SIZE_T_MAX), NSIZES, + "size2index() should return NSIZES on overflow"); - assert_zu_gt(s2u(max_size_class+1), HUGE_MAXCLASS, - "s2u() should return > HUGE_MAXCLASS for unsupported size"); - assert_zu_gt(s2u(ZU(PTRDIFF_MAX)+1), HUGE_MAXCLASS, - "s2u() should return > HUGE_MAXCLASS for unsupported size"); + assert_zu_eq(s2u(max_size_class+1), 0, + "s2u() should return 0 for unsupported size"); + assert_zu_eq(s2u(ZU(PTRDIFF_MAX)+1), 0, + "s2u() should return 0 for unsupported size"); assert_zu_eq(s2u(SIZE_T_MAX), 0, "s2u() should return 0 on overflow"); + + assert_u_eq(psz2ind(max_size_class+1), NPSIZES, + "psz2ind() should return NPSIZES on overflow"); + assert_u_eq(psz2ind(ZU(PTRDIFF_MAX)+1), NPSIZES, + "psz2ind() should return NPSIZES on overflow"); + assert_u_eq(psz2ind(SIZE_T_MAX), NPSIZES, + "psz2ind() should return NPSIZES on overflow"); + + assert_zu_eq(psz2u(max_size_class+1), 0, + "psz2u() should return 0 for unsupported size"); + assert_zu_eq(psz2u(ZU(PTRDIFF_MAX)+1), 0, + "psz2u() should return 0 for unsupported size"); + assert_zu_eq(psz2u(SIZE_T_MAX), 0, + "psz2u() should return 0 on 
overflow"); } TEST_END @@ -108,5 +179,6 @@ main(void) return (test( test_size_classes, + test_psize_classes, test_overflow)); } diff --git a/test/unit/tsd.c b/test/unit/tsd.c index 7dde4b77..4e2622a3 100644 --- a/test/unit/tsd.c +++ b/test/unit/tsd.c @@ -58,18 +58,18 @@ thd_start(void *arg) data_t d = (data_t)(uintptr_t)arg; void *p; - assert_x_eq(*data_tsd_get(), DATA_INIT, + assert_x_eq(*data_tsd_get(true), DATA_INIT, "Initial tsd get should return initialization value"); p = malloc(1); assert_ptr_not_null(p, "Unexpected malloc() failure"); data_tsd_set(&d); - assert_x_eq(*data_tsd_get(), d, + assert_x_eq(*data_tsd_get(true), d, "After tsd set, tsd get should return value that was set"); d = 0; - assert_x_eq(*data_tsd_get(), (data_t)(uintptr_t)arg, + assert_x_eq(*data_tsd_get(true), (data_t)(uintptr_t)arg, "Resetting local data should have no effect on tsd"); free(p);