Update ChangeLog for release 5.3.1

Fix opt.max_background_threads default in docs
Documentation updates (#2869)
2026-04-14 22:51:50 +03:00 · 2026-04-13 17:12:37 -07:00 · 2026-04-13 14:46:53 -07:00 · 2026-04-07 10:41:44 -07:00 · 2026-04-01 23:15:19 -04:00 · 2026-04-01 23:15:19 -04:00
548 changed files with 103673 additions and 36738 deletions
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -5,24 +5,47 @@ environment:
  - MSYSTEM: MINGW64
    CPU: x86_64
    MSVC: amd64
+    CONFIG_FLAGS: --enable-debug
+  - MSYSTEM: MINGW64
+    CPU: x86_64
+    CONFIG_FLAGS: --enable-debug
+    EXTRA_CFLAGS: "-fcommon"
  - MSYSTEM: MINGW32
    CPU: i686
    MSVC: x86
-  - MSYSTEM: MINGW64
-    CPU: x86_64
+    CONFIG_FLAGS: --enable-debug
  - MSYSTEM: MINGW32
    CPU: i686
+    CONFIG_FLAGS: --enable-debug
+    EXTRA_CFLAGS: "-fcommon"
+  - MSYSTEM: MINGW64
+    CPU: x86_64
+    MSVC: amd64
+    CONFIG_FLAGS:
+  - MSYSTEM: MINGW64
+    CPU: x86_64
+    CONFIG_FLAGS:
+    EXTRA_CFLAGS: "-fcommon"
+  - MSYSTEM: MINGW32
+    CPU: i686
+    MSVC: x86
+    CONFIG_FLAGS:
+  - MSYSTEM: MINGW32
+    CPU: i686
+    CONFIG_FLAGS:
+    EXTRA_CFLAGS: "-fcommon"

 install:
  - set PATH=c:\msys64\%MSYSTEM%\bin;c:\msys64\usr\bin;%PATH%
  - if defined MSVC call "c:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" %MSVC%
  - if defined MSVC pacman --noconfirm -Rsc mingw-w64-%CPU%-gcc gcc
-  - pacman --noconfirm -Suy mingw-w64-%CPU%-make
+  - pacman --noconfirm -Syuu
+  - pacman --noconfirm -S autoconf

 build_script:
  - bash -c "autoconf"
-  - bash -c "./configure"
-  - mingw32-make -j3
+  - bash -c "./configure $CONFIG_FLAGS"
+  - mingw32-make
  - file lib/jemalloc.dll
-  - mingw32-make -j3 tests
+  - mingw32-make tests
  - mingw32-make -k check
--- a/.clang-format
+++ b/.clang-format
@@ -0,0 +1,122 @@
+# jemalloc targets clang-format version 8.  We include every option it supports
+# here, but comment out the ones that aren't relevant for us.
+---
+# AccessModifierOffset: -2
+AlignAfterOpenBracket: DontAlign
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: true
+AlignEscapedNewlines: Right
+AlignOperands: false
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: false
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterReturnType: AllDefinitions
+AlwaysBreakBeforeMultilineStrings: true
+# AlwaysBreakTemplateDeclarations: Yes
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterClass: true
+  AfterControlStatement: true
+  AfterEnum: true
+  AfterFunction: true
+  AfterNamespace: true
+  AfterObjCDeclaration: true
+  AfterStruct: true
+  AfterUnion: true
+  BeforeCatch: true
+  BeforeElse: true
+  IndentBraces: false
+# BreakAfterJavaFieldAnnotations: true
+BreakBeforeBinaryOperators: NonAssignment
+BreakBeforeBraces: Attach
+BreakBeforeTernaryOperators: true
+# BreakConstructorInitializers: BeforeColon
+# BreakInheritanceList: BeforeColon
+BreakStringLiterals: false
+ColumnLimit: 80
+# CommentPragmas: ''
+# CompactNamespaces: true
+# ConstructorInitializerAllOnOneLineOrOnePerLine: true
+# ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
+ForEachMacros:   [ ql_foreach, qr_foreach, ]
+# IncludeBlocks: Preserve
+# IncludeCategories:
+#   - Regex:           '^<.*\.h(pp)?>'
+#     Priority:        1
+# IncludeIsMainRegex: ''
+IndentCaseLabels: false
+IndentPPDirectives: AfterHash
+IndentWidth: 8
+IndentWrappedFunctionNames: false
+# JavaImportGroups: []
+# JavaScriptQuotes: Leave
+# JavaScriptWrapImports: True
+KeepEmptyLinesAtTheStartOfBlocks: false
+Language: Cpp
+MacroBlockBegin: ''
+MacroBlockEnd: ''
+MaxEmptyLinesToKeep: 1
+# NamespaceIndentation: None
+# ObjCBinPackProtocolList: Auto
+# ObjCBlockIndentWidth: 2
+# ObjCSpaceAfterProperty: false
+# ObjCSpaceBeforeProtocolList: false
+
+PenaltyBreakAssignment: 100
+PenaltyBreakBeforeFirstCallParameter: 100
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+# PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PointerAlignment: Right
+# RawStringFormats:
+#   - Language: TextProto
+#       Delimiters:
+#         - 'pb'
+#         - 'proto'
+#       EnclosingFunctions:
+#         - 'PARSE_TEXT_PROTO'
+#       BasedOnStyle: google
+#   - Language: Cpp
+#       Delimiters:
+#         - 'cc'
+#         - 'cpp'
+#       BasedOnStyle: llvm
+#       CanonicalDelimiter: 'cc'
+ReflowComments: false
+SortIncludes: false
+SpaceAfterCStyleCast: false
+# SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+# SpaceBeforeCpp11BracedList: false
+# SpaceBeforeCtorInitializerColon: true
+# SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+# SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInCStyleCastParentheses: false
+# SpacesInContainerLiterals: false
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+# Standard: Cpp11
+# This is nominally supported in clang-format version 8, but not in the build
+# used by some of the core jemalloc developers.
+# StatementMacros: []
+TabWidth: 8
+UseTab: ForIndentation
+...
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -0,0 +1,2 @@
+554185356bf990155df8d72060c4efe993642baf
+34f359e0ca613b5f9d970e9b2152a5203c9df8d6
--- a/.github/workflows/check_formatting.yaml
+++ b/.github/workflows/check_formatting.yaml
@@ -0,0 +1,10 @@
+name: 'Check Formatting'
+on: [pull_request]
+jobs:
+  check-formatting:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repository
+        uses: actions/checkout@v4
+      - name: Check for trailing whitespace
+        run: scripts/check_trailing_whitespace.sh
--- a/.github/workflows/freebsd-ci.yml
+++ b/.github/workflows/freebsd-ci.yml
@@ -0,0 +1,66 @@
+# This config file is generated by ./scripts/gen_gh_actions.py.
+# Do not edit by hand.
+
+name: FreeBSD CI
+
+on:
+  push:
+    branches: [ dev, ci_travis ]
+  pull_request:
+    branches: [ dev ]
+
+jobs:
+  test-freebsd:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        debug: ['--enable-debug', '--disable-debug']
+        prof: ['--enable-prof', '--disable-prof']
+        arch: ['64-bit', '32-bit']
+        uncommon:
+          - ''
+          - '--with-lg-page=16 --with-malloc-conf=tcache:false'
+
+    name: FreeBSD (${{ matrix.arch }}, debug=${{ matrix.debug }}, prof=${{ matrix.prof }}${{ matrix.uncommon && ', uncommon' || '' }})
+
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        fetch-depth: 1
+
+    - name: Test on FreeBSD
+      uses: vmactions/freebsd-vm@v1
+      with:
+        release: '15.0'
+        usesh: true
+        prepare: |
+          pkg install -y autoconf gmake
+        run: |
+          # Verify we're running in FreeBSD
+          echo "==== System Information ===="
+          uname -a
+          freebsd-version
+          echo "============================"
+
+          # Set compiler flags for 32-bit if needed
+          if [ "${{ matrix.arch }}" = "32-bit" ]; then
+            export CC="cc -m32"
+            export CXX="c++ -m32"
+          fi
+
+          # Generate configure script
+          autoconf
+
+          # Configure with matrix options
+          ./configure --with-jemalloc-prefix=ci_ ${{ matrix.debug }} ${{ matrix.prof }} ${{ matrix.uncommon }}
+
+          # Get CPU count for parallel builds
+          export JFLAG=$(sysctl -n kern.smp.cpus)
+
+          gmake -j${JFLAG}
+          gmake -j${JFLAG} tests
+          gmake check
+
+
+
--- a/.github/workflows/linux-ci.yml
+++ b/.github/workflows/linux-ci.yml
@@ -0,0 +1,695 @@
+# This config file is generated by ./scripts/gen_gh_actions.py.
+# Do not edit by hand.
+
+name: Linux CI
+
+on:
+  push:
+    branches: [ dev, ci_travis ]
+  pull_request:
+    branches: [ dev ]
+
+jobs:
+  test-linux:
+    runs-on: ubuntu-24.04
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - env:
+              CC: gcc
+              CXX: g++
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: clang
+              CXX: clang++
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: yes
+              COMPILER_FLAGS: -m32
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --enable-debug
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --enable-prof
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --disable-stats
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --disable-libdl
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --enable-opt-safety-checks
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --with-lg-page=16
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=dss:primary"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=background_thread:true"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: clang
+              CXX: clang++
+              CROSS_COMPILE_32BIT: yes
+              COMPILER_FLAGS: -m32
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+          - env:
+              CC: clang
+              CXX: clang++
+              CONFIGURE_FLAGS: --enable-debug
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+          - env:
+              CC: clang
+              CXX: clang++
+              CONFIGURE_FLAGS: --enable-prof
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+          - env:
+              CC: clang
+              CXX: clang++
+              CONFIGURE_FLAGS: --disable-stats
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+          - env:
+              CC: clang
+              CXX: clang++
+              CONFIGURE_FLAGS: --disable-libdl
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+          - env:
+              CC: clang
+              CXX: clang++
+              CONFIGURE_FLAGS: --enable-opt-safety-checks
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+          - env:
+              CC: clang
+              CXX: clang++
+              CONFIGURE_FLAGS: --with-lg-page=16
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+          - env:
+              CC: clang
+              CXX: clang++
+              CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+          - env:
+              CC: clang
+              CXX: clang++
+              CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+          - env:
+              CC: clang
+              CXX: clang++
+              CONFIGURE_FLAGS: "--with-malloc-conf=dss:primary"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+          - env:
+              CC: clang
+              CXX: clang++
+              CONFIGURE_FLAGS: "--with-malloc-conf=percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+          - env:
+              CC: clang
+              CXX: clang++
+              CONFIGURE_FLAGS: "--with-malloc-conf=background_thread:true"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: yes
+              COMPILER_FLAGS: -m32
+              CONFIGURE_FLAGS: --enable-debug
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: yes
+              COMPILER_FLAGS: -m32
+              CONFIGURE_FLAGS: --enable-prof
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: yes
+              COMPILER_FLAGS: -m32
+              CONFIGURE_FLAGS: --disable-stats
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: yes
+              COMPILER_FLAGS: -m32
+              CONFIGURE_FLAGS: --disable-libdl
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: yes
+              COMPILER_FLAGS: -m32
+              CONFIGURE_FLAGS: --enable-opt-safety-checks
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: yes
+              COMPILER_FLAGS: -m32
+              CONFIGURE_FLAGS: --with-lg-page=16
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: yes
+              COMPILER_FLAGS: -m32
+              CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: yes
+              COMPILER_FLAGS: -m32
+              CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: yes
+              COMPILER_FLAGS: -m32
+              CONFIGURE_FLAGS: "--with-malloc-conf=dss:primary"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: yes
+              COMPILER_FLAGS: -m32
+              CONFIGURE_FLAGS: "--with-malloc-conf=percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: yes
+              COMPILER_FLAGS: -m32
+              CONFIGURE_FLAGS: "--with-malloc-conf=background_thread:true"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-debug --enable-prof"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-debug --disable-stats"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-debug --disable-libdl"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-debug --enable-opt-safety-checks"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-debug --with-lg-page=16"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-debug --enable-prof --enable-prof-frameptr"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-debug --with-malloc-conf=tcache:false"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-debug --with-malloc-conf=dss:primary"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-debug --with-malloc-conf=percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-debug --with-malloc-conf=background_thread:true"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --disable-stats"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --disable-libdl"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --enable-opt-safety-checks"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --with-lg-page=16"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --enable-prof --enable-prof-frameptr"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --with-malloc-conf=tcache:false"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --with-malloc-conf=dss:primary"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --with-malloc-conf=percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --with-malloc-conf=background_thread:true"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-stats --disable-libdl"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-stats --enable-opt-safety-checks"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-stats --with-lg-page=16"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-stats --enable-prof --enable-prof-frameptr"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-stats --with-malloc-conf=tcache:false"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-stats --with-malloc-conf=dss:primary"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-stats --with-malloc-conf=percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-stats --with-malloc-conf=background_thread:true"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-libdl --enable-opt-safety-checks"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-libdl --with-lg-page=16"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-libdl --enable-prof --enable-prof-frameptr"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-libdl --with-malloc-conf=tcache:false"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-libdl --with-malloc-conf=dss:primary"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-libdl --with-malloc-conf=percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--disable-libdl --with-malloc-conf=background_thread:true"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-opt-safety-checks --with-lg-page=16"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-opt-safety-checks --enable-prof --enable-prof-frameptr"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-opt-safety-checks --with-malloc-conf=tcache:false"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-opt-safety-checks --with-malloc-conf=dss:primary"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-opt-safety-checks --with-malloc-conf=percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-opt-safety-checks --with-malloc-conf=background_thread:true"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-lg-page=16 --enable-prof --enable-prof-frameptr"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-lg-page=16 --with-malloc-conf=tcache:false"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-lg-page=16 --with-malloc-conf=dss:primary"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-lg-page=16 --with-malloc-conf=percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-lg-page=16 --with-malloc-conf=background_thread:true"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr --with-malloc-conf=tcache:false"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr --with-malloc-conf=dss:primary"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr --with-malloc-conf=percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr --with-malloc-conf=background_thread:true"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false,dss:primary"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false,percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false,background_thread:true"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=dss:primary,percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=dss:primary,background_thread:true"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=percpu_arena:percpu,background_thread:true"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-debug --disable-cache-oblivious --enable-stats --enable-log --enable-prof"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-debug --enable-experimental-smallocx --enable-stats --enable-prof"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Show OS version
+      run: |
+        echo "=== System Information ==="
+        uname -a
+        echo ""
+        echo "=== Architecture ==="
+        uname -m
+        arch
+        echo ""
+        echo "=== OS Release ==="
+        cat /etc/os-release || true
+        echo ""
+        echo "=== CPU Info ==="
+        lscpu | grep -E "Architecture|CPU op-mode|Byte Order|CPU\(s\):" || true
+
+    - name: Install dependencies (32-bit)
+      if: matrix.env.CROSS_COMPILE_32BIT == 'yes'
+      run: |
+        sudo dpkg --add-architecture i386
+        sudo apt-get update
+        sudo apt-get install -y gcc-multilib g++-multilib libc6-dev-i386
+
+    - name: Build and test
+      env:
+        CC: ${{ matrix.env.CC }}
+        CXX: ${{ matrix.env.CXX }}
+        COMPILER_FLAGS: ${{ matrix.env.COMPILER_FLAGS }}
+        CONFIGURE_FLAGS: ${{ matrix.env.CONFIGURE_FLAGS }}
+        EXTRA_CFLAGS: ${{ matrix.env.EXTRA_CFLAGS }}
+      run: |
+        # Smoke-test the generator script (output goes to a scratch file; it is not compared against this workflow)
+        ./scripts/gen_gh_actions.py > gh_actions_script.yml
+
+        # Run autoconf
+        autoconf
+
+        # Configure with flags
+        if [ -n "$COMPILER_FLAGS" ]; then
+          ./configure CC="${CC} ${COMPILER_FLAGS}" CXX="${CXX} ${COMPILER_FLAGS}" $CONFIGURE_FLAGS
+        else
+          ./configure $CONFIGURE_FLAGS
+        fi
+
+        # Build
+        make -j3
+        make -j3 tests
+
+        # Run tests
+        make check
+
+
+  test-linux-arm64:
+    runs-on: ubuntu-24.04-arm
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - env:
+              CC: gcc
+              CXX: g++
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: clang
+              CXX: clang++
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --enable-debug
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --enable-prof
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --disable-stats
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --disable-libdl
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --enable-opt-safety-checks
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --with-lg-page=16
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-lg-page=16 --with-lg-hugepage=29"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=dss:primary"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=background_thread:true"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Show OS version
+      run: |
+        echo "=== System Information ==="
+        uname -a
+        echo ""
+        echo "=== Architecture ==="
+        uname -m
+        arch
+        echo ""
+        echo "=== OS Release ==="
+        cat /etc/os-release || true
+        echo ""
+        echo "=== CPU Info ==="
+        lscpu | grep -E "Architecture|CPU op-mode|Byte Order|CPU\(s\):" || true
+
+    - name: Install dependencies (32-bit)
+      if: matrix.env.CROSS_COMPILE_32BIT == 'yes'
+      run: |
+        sudo dpkg --add-architecture i386
+        sudo apt-get update
+        sudo apt-get install -y gcc-multilib g++-multilib libc6-dev-i386
+
+    - name: Build and test
+      env:
+        CC: ${{ matrix.env.CC }}
+        CXX: ${{ matrix.env.CXX }}
+        COMPILER_FLAGS: ${{ matrix.env.COMPILER_FLAGS }}
+        CONFIGURE_FLAGS: ${{ matrix.env.CONFIGURE_FLAGS }}
+        EXTRA_CFLAGS: ${{ matrix.env.EXTRA_CFLAGS }}
+      run: |
+        # Smoke-test the generator script (output goes to a scratch file; it is not compared against this workflow)
+        ./scripts/gen_gh_actions.py > gh_actions_script.yml
+
+        # Run autoconf
+        autoconf
+
+        # Configure with flags
+        if [ -n "$COMPILER_FLAGS" ]; then
+          ./configure CC="${CC} ${COMPILER_FLAGS}" CXX="${CXX} ${COMPILER_FLAGS}" $CONFIGURE_FLAGS
+        else
+          ./configure $CONFIGURE_FLAGS
+        fi
+
+        # Build
+        make -j3
+        make -j3 tests
+
+        # Run tests
+        make check
+
+
+
--- a/.github/workflows/macos-ci.yml
+++ b/.github/workflows/macos-ci.yml
@ -0,0 +1,212 @@
+# This config file is generated by ./scripts/gen_gh_actions.py.
+# Do not edit by hand.
+
+name: macOS CI
+
+on:
+  push:
+    branches: [ dev, ci_travis ]
+  pull_request:
+    branches: [ dev ]
+
+jobs:
+  # Intel macOS job: the steps below run once per matrix entry.  Each entry's
+  # `env` mapping is forwarded to the "Build and test" step.
+  test-macos:
+    runs-on: macos-15-intel
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - env:
+              CC: gcc
+              CXX: g++
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          # 'yes' is quoted so it stays a string under every YAML loader
+          # (unquoted `yes` is a boolean in YAML 1.1).
+          # NOTE(review): no step in this job reads CROSS_COMPILE_32BIT and this
+          # entry sets no COMPILER_FLAGS, so it appears to build the same
+          # configuration as the first entry -- confirm whether that is intended.
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: 'yes'
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --enable-debug
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --disable-stats
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --disable-libdl
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --enable-opt-safety-checks
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --with-lg-page=16
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+
+    # Steps executed for every matrix entry of this job.
+    steps:
+    - uses: actions/checkout@v4
+
+    # Prints the macOS version, machine architecture and CPU model to the job
+    # log.
+    - name: Show OS version
+      run: |
+        echo "=== macOS Version ==="
+        sw_vers
+        echo ""
+        echo "=== Architecture ==="
+        uname -m
+        arch
+        echo ""
+        echo "=== CPU Info ==="
+        sysctl -n machdep.cpu.brand_string
+        sysctl -n hw.machine
+
+    # autoconf is needed by the next step, which generates ./configure with it.
+    - name: Install dependencies
+      run: |
+        brew install autoconf
+
+    # Generates configure, configures with the matrix entry's flags, builds the
+    # library and the tests, then runs `make check`.  CC/CXX fall back to
+    # gcc/g++ when a matrix entry does not set them.  EXTRA_CFLAGS is only
+    # exported into the environment here; presumably configure picks it up --
+    # TODO confirm.
+    - name: Build and test
+      env:
+        CC: ${{ matrix.env.CC || 'gcc' }}
+        CXX: ${{ matrix.env.CXX || 'g++' }}
+        COMPILER_FLAGS: ${{ matrix.env.COMPILER_FLAGS }}
+        CONFIGURE_FLAGS: ${{ matrix.env.CONFIGURE_FLAGS }}
+        EXTRA_CFLAGS: ${{ matrix.env.EXTRA_CFLAGS }}
+      run: |
+        # Run autoconf
+        autoconf
+
+        # Configure with flags
+        if [ -n "$COMPILER_FLAGS" ]; then
+          ./configure CC="${CC} ${COMPILER_FLAGS}" CXX="${CXX} ${COMPILER_FLAGS}" $CONFIGURE_FLAGS
+        else
+          ./configure $CONFIGURE_FLAGS
+        fi
+
+        # Build
+        make -j3
+        make -j3 tests
+
+        # Run tests
+        make check
+
+
+  # macOS job on the `macos-15` runner: the steps below run once per matrix
+  # entry.  Each entry's `env` mapping is forwarded to the "Build and test"
+  # step.
+  test-macos-arm64:
+    runs-on: macos-15
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - env:
+              CC: gcc
+              CXX: g++
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          # 'yes' is quoted so it stays a string under every YAML loader
+          # (unquoted `yes` is a boolean in YAML 1.1).
+          # NOTE(review): no step in this job reads CROSS_COMPILE_32BIT and this
+          # entry sets no COMPILER_FLAGS, so it appears to build the same
+          # configuration as the first entry -- confirm whether that is intended.
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: 'yes'
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --enable-debug
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --disable-stats
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --disable-libdl
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --enable-opt-safety-checks
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --with-lg-page=16
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-lg-page=16 --with-lg-hugepage=29"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: "--with-malloc-conf=percpu_arena:percpu"
+              EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
+
+    # Steps executed for every matrix entry of this job.
+    steps:
+    - uses: actions/checkout@v4
+
+    # Prints the macOS version, machine architecture and CPU model to the job
+    # log.
+    - name: Show OS version
+      run: |
+        echo "=== macOS Version ==="
+        sw_vers
+        echo ""
+        echo "=== Architecture ==="
+        uname -m
+        arch
+        echo ""
+        echo "=== CPU Info ==="
+        sysctl -n machdep.cpu.brand_string
+        sysctl -n hw.machine
+
+    # autoconf is needed by the next step, which generates ./configure with it.
+    - name: Install dependencies
+      run: |
+        brew install autoconf
+
+    # Generates configure, configures with the matrix entry's flags, builds the
+    # library and the tests, then runs `make check`.  CC/CXX fall back to
+    # gcc/g++ when a matrix entry does not set them.  EXTRA_CFLAGS is only
+    # exported into the environment here; presumably configure picks it up --
+    # TODO confirm.
+    - name: Build and test
+      env:
+        CC: ${{ matrix.env.CC || 'gcc' }}
+        CXX: ${{ matrix.env.CXX || 'g++' }}
+        COMPILER_FLAGS: ${{ matrix.env.COMPILER_FLAGS }}
+        CONFIGURE_FLAGS: ${{ matrix.env.CONFIGURE_FLAGS }}
+        EXTRA_CFLAGS: ${{ matrix.env.EXTRA_CFLAGS }}
+      run: |
+        # Run autoconf
+        autoconf
+
+        # Configure with flags
+        if [ -n "$COMPILER_FLAGS" ]; then
+          ./configure CC="${CC} ${COMPILER_FLAGS}" CXX="${CXX} ${COMPILER_FLAGS}" $CONFIGURE_FLAGS
+        else
+          ./configure $CONFIGURE_FLAGS
+        fi
+
+        # Build
+        make -j3
+        make -j3 tests
+
+        # Run tests
+        make check
+
+
+
--- a/.github/workflows/static_analysis.yaml
+++ b/.github/workflows/static_analysis.yaml
@ -0,0 +1,68 @@
+# Static analysis for pull requests: builds libunwind, downloads an LLVM
+# release, installs CodeChecker, then runs scripts/run_static_analysis.sh.
+# The job fails when that step's HAS_STATIC_ANALYSIS_RESULTS output is '1'.
+name: 'Static Analysis'
+on: [pull_request]
+jobs:
+  static-analysis:
+    runs-on: ubuntu-latest
+    steps:
+      # We build libunwind ourselves because sadly the version
+      # provided by Ubuntu via apt-get is much too old.
+      - name: Check out libunwind
+        uses: actions/checkout@v4
+        with:
+          repository: libunwind/libunwind
+          path: libunwind
+          ref: 'v1.6.2'
+          github-server-url: 'https://github.com'
+      - name: Install libunwind
+        run: |
+          cd libunwind
+          autoreconf -i
+          ./configure --prefix=/usr
+          make -s -j $(nproc) V=0
+          sudo make -s install V=0
+          cd ..
+          rm -rf libunwind
+      - name: Check out repository
+        uses: actions/checkout@v4
+      # We download LLVM directly from the latest stable release
+      # on GitHub, because this tends to be much newer than the
+      # version available via apt-get in Ubuntu.
+      # NOTE(review): this third-party action is referenced by a mutable
+      # branch and is handed GITHUB_TOKEN; consider pinning it to a release
+      # tag or commit SHA.
+      - name: Download LLVM
+        uses: dsaltares/fetch-gh-release-asset@master
+        with:
+          repo: 'llvm/llvm-project'
+          version: 'tags/llvmorg-16.0.4'
+          file: 'clang[+]llvm-.*x86_64-linux-gnu.*'
+          regex: true
+          target: 'llvm_assets/'
+          token: ${{ secrets.GITHUB_TOKEN }}
+      # The LLVM tarball is extracted in the background while the apt/pip
+      # installs run; `wait` joins the extraction before the clang bin
+      # directory is published as the LLVM_BIN_DIR step output.
+      - name: Install prerequisites
+        id: install_prerequisites
+        run: |
+          tar -C llvm_assets -xaf llvm_assets/*.tar* &
+          sudo apt-get update
+          sudo apt-get install -y jq bear python3-pip
+          pip install codechecker
+          echo "Extracting LLVM from tar" 1>&2
+          wait
+          echo "LLVM_BIN_DIR=$(echo llvm_assets/clang*/bin)" >> "$GITHUB_OUTPUT"
+      # `>` folds the lines below into one command line, so PATH and LDFLAGS
+      # act as environment assignments for the script invocation.
+      - name: Run static analysis
+        id: run_static_analysis
+        run: >
+          PATH="${{ steps.install_prerequisites.outputs.LLVM_BIN_DIR }}:$PATH"
+          LDFLAGS='-L/usr/lib'
+          scripts/run_static_analysis.sh static_analysis_results "$GITHUB_OUTPUT"
+      # The condition must be a bare expression.  Writing it as
+      # `${{ ... }} == '1'` mixes an expression with literal text, which
+      # evaluates to a non-empty string and is therefore always true.
+      - name: Upload static analysis results
+        if: steps.run_static_analysis.outputs.HAS_STATIC_ANALYSIS_RESULTS == '1'
+        uses: actions/upload-artifact@v4
+        with:
+          name: static_analysis_results
+          path: static_analysis_results
+      # Fails the job (after the upload above) when results were reported.
+      - name: Check static analysis results
+        run: |
+          if [[ "${{ steps.run_static_analysis.outputs.HAS_STATIC_ANALYSIS_RESULTS }}" == '1' ]]
+          then
+              echo "::error::Static analysis found issues with your code. Download the 'static_analysis_results' artifact from this workflow and view the 'index.html' file contained within it in a web browser locally for detailed results."
+              exit 1
+          fi
+
--- a/.github/workflows/windows-ci.yml
+++ b/.github/workflows/windows-ci.yml
@ -0,0 +1,155 @@
+# This config file is generated by ./scripts/gen_gh_actions.py.
+# Do not edit by hand.
+
+name: Windows CI
+
+on:
+  push:
+    branches: [ dev, ci_travis ]
+  pull_request:
+    branches: [ dev ]
+
+jobs:
+  # Windows job: the steps below run once per matrix entry.  Entries whose CC
+  # is cl.exe take the MSVC build step; all others take the MinGW-GCC step.
+  # CROSS_COMPILE_32BIT is compared against the string 'yes' by the steps
+  # (MSYS2 msystem and MSVC arch selection), so it is quoted to stay a string
+  # under every YAML loader (unquoted `yes` is a boolean in YAML 1.1).
+  test-windows:
+    runs-on: windows-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - env:
+              CC: gcc
+              CXX: g++
+              EXTRA_CFLAGS: -fcommon
+          - env:
+              CC: gcc
+              CXX: g++
+              CONFIGURE_FLAGS: --enable-debug
+              EXTRA_CFLAGS: -fcommon
+          - env:
+              CC: cl.exe
+              CXX: cl.exe
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: 'yes'
+              EXTRA_CFLAGS: -fcommon
+          - env:
+              CC: cl.exe
+              CXX: cl.exe
+              CONFIGURE_FLAGS: --enable-debug
+          - env:
+              CC: gcc
+              CXX: g++
+              CROSS_COMPILE_32BIT: 'yes'
+              CONFIGURE_FLAGS: --enable-debug
+              EXTRA_CFLAGS: -fcommon
+          - env:
+              CC: cl.exe
+              CXX: cl.exe
+              CROSS_COMPILE_32BIT: 'yes'
+          - env:
+              CC: cl.exe
+              CXX: cl.exe
+              CROSS_COMPILE_32BIT: 'yes'
+              CONFIGURE_FLAGS: --enable-debug
+
+    steps:
+    - uses: actions/checkout@v4
+
+    # Prints the Windows OS name/version and the processor architecture to the
+    # job log.  Runs under cmd because the script uses %VAR% expansion and the
+    # `echo.` blank-line idiom.
+    - name: Show OS version
+      shell: cmd
+      run: |
+        echo === Windows Version ===
+        systeminfo | findstr /B /C:"OS Name" /C:"OS Version"
+        ver
+        echo.
+        echo === Architecture ===
+        echo PROCESSOR_ARCHITECTURE=%PROCESSOR_ARCHITECTURE%
+        echo.
+
+    # Provision the MSYS2 shell used by the build steps: MINGW32 for matrix
+    # entries with CROSS_COMPILE_32BIT == 'yes', MINGW64 otherwise.
+    - uses: msys2/setup-msys2@v2
+      name: Setup MSYS2
+      with:
+        update: true
+        msystem: ${{ matrix.env.CROSS_COMPILE_32BIT == 'yes' && 'MINGW32' || 'MINGW64' }}
+        # Space-separated package lists (same values as a folded `>-` scalar).
+        install: 'autotools git'
+        pacboy: 'make:p gcc:p binutils:p'
+
+    # MinGW-GCC path: runs for every matrix entry whose CC is not cl.exe.
+    # Generates configure, configures with the matrix entry's flags (folding
+    # COMPILER_FLAGS into CC/CXX when set), builds the library and the tests,
+    # then runs `check` with -k so make keeps going after a failing target.
+    # EXTRA_CFLAGS is only exported into the environment here; presumably
+    # configure picks it up -- TODO confirm.
+    - name: Build and test (MinGW-GCC)
+      if: matrix.env.CC != 'cl.exe'
+      shell: msys2 {0}
+      env:
+        CC: ${{ matrix.env.CC || 'gcc' }}
+        CXX: ${{ matrix.env.CXX || 'g++' }}
+        COMPILER_FLAGS: ${{ matrix.env.COMPILER_FLAGS }}
+        CONFIGURE_FLAGS: ${{ matrix.env.CONFIGURE_FLAGS }}
+        EXTRA_CFLAGS: ${{ matrix.env.EXTRA_CFLAGS }}
+      run: |
+        # Run autoconf
+        autoconf
+
+        # Configure with flags
+        if [ -n "$COMPILER_FLAGS" ]; then
+          ./configure CC="${CC} ${COMPILER_FLAGS}" CXX="${CXX} ${COMPILER_FLAGS}" $CONFIGURE_FLAGS
+        else
+          ./configure $CONFIGURE_FLAGS
+        fi
+
+        # Build (mingw32-make is the "make" command in MSYS2)
+        mingw32-make -j3
+        mingw32-make tests
+
+        # Run tests
+        mingw32-make -k check
+
+    # MSVC matrix entries only.  Presumably loads the Visual Studio developer
+    # environment for the selected arch: x86 when CROSS_COMPILE_32BIT == 'yes',
+    # x64 otherwise.
+    - uses: ilammy/msvc-dev-cmd@v1
+      name: Setup MSVC environment
+      if: matrix.env.CC == 'cl.exe'
+      with:
+        arch: ${{ matrix.env.CROSS_COMPILE_32BIT == 'yes' && 'x86' || 'x64' }}
+
+    # MSVC path: runs for matrix entries whose CC is cl.exe, inside the MSYS2
+    # shell.  MSYS2_PATH_TYPE=inherit makes the shell inherit the Windows PATH
+    # so cl.exe and the other MSVC tools can be found.  Generates configure,
+    # configures for cl.exe/lib.exe, builds the library and the tests, then
+    # runs `check` with -k so make keeps going after a failing target.
+    - name: Build and test (MSVC)
+      if: matrix.env.CC == 'cl.exe'
+      shell: msys2 {0}
+      env:
+        CONFIGURE_FLAGS: ${{ matrix.env.CONFIGURE_FLAGS }}
+        MSYS2_PATH_TYPE: inherit
+      run: |
+        # Export MSVC environment variables for configure
+        export CC=cl.exe
+        export CXX=cl.exe
+        export AR=lib.exe
+        export NM=dumpbin.exe
+        export RANLIB=:
+
+        # Verify cl.exe is accessible (should be in PATH via inherit)
+        if ! which cl.exe > /dev/null 2>&1; then
+          echo "cl.exe not found, trying to locate MSVC..."
+          # Find and add MSVC bin directory to PATH.  VCToolsInstallDir is a
+          # Windows-style path; cygpath converts it to the POSIX form that
+          # the MSYS2 PATH expects.  Skipped when the variable is not set.
+          if [ -n "${VCToolsInstallDir:-}" ]; then
+            MSVC_BIN=$(cygpath -u "$VCToolsInstallDir")
+            # Drop a trailing slash so the joined paths below stay clean
+            MSVC_BIN=${MSVC_BIN%/}
+            export PATH="$PATH:$MSVC_BIN/bin/Hostx64/x64:$MSVC_BIN/bin/Hostx86/x86"
+          fi
+        fi
+
+        # Run autoconf
+        autoconf
+
+        # Configure with MSVC
+        ./configure CC=cl.exe CXX=cl.exe AR=lib.exe $CONFIGURE_FLAGS
+
+        # Build (mingw32-make is the "make" command in MSYS2)
+        mingw32-make -j3
+        # Build tests sequentially due to PDB file issues
+        mingw32-make tests
+
+        # Run tests
+        mingw32-make -k check
+
+
+
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,3 @@
-/*.gcov.*
-
 /bin/jemalloc-config
 /bin/jemalloc.sh
 /bin/jeprof
@ -15,20 +13,25 @@
 /doc/jemalloc.html
 /doc/jemalloc.3

+/doc_internal/PROFILING_INTERNALS.pdf
+
 /jemalloc.pc

 /lib/

 /Makefile

-/include/jemalloc/internal/jemalloc_internal.h
+/include/jemalloc/internal/jemalloc_preamble.h
 /include/jemalloc/internal/jemalloc_internal_defs.h
+/include/jemalloc/internal/private_namespace.gen.h
 /include/jemalloc/internal/private_namespace.h
-/include/jemalloc/internal/private_unnamespace.h
+/include/jemalloc/internal/private_namespace_jet.gen.h
+/include/jemalloc/internal/private_namespace_jet.h
+/include/jemalloc/internal/private_symbols.awk
+/include/jemalloc/internal/private_symbols_jet.awk
 /include/jemalloc/internal/public_namespace.h
 /include/jemalloc/internal/public_symbols.txt
 /include/jemalloc/internal/public_unnamespace.h
-/include/jemalloc/internal/size_classes.h
 /include/jemalloc/jemalloc.h
 /include/jemalloc/jemalloc_defs.h
 /include/jemalloc/jemalloc_macros.h
@ -40,49 +43,63 @@
 /include/jemalloc/jemalloc_typedefs.h

 /src/*.[od]
-/src/*.gcda
-/src/*.gcno
+/src/*.sym
+
+# These are semantically meaningful for clangd and related tooling.
+/build/
+/.cache/
+compile_commands.json
+/static_analysis_raw_results
+/static_analysis_results
+
+/run_tests.out/

 /test/test.sh
 test/include/test/jemalloc_test.h
 test/include/test/jemalloc_test_defs.h

 /test/integration/[A-Za-z]*
+!/test/integration/cpp/
 !/test/integration/[A-Za-z]*.*
 /test/integration/*.[od]
-/test/integration/*.gcda
-/test/integration/*.gcno
 /test/integration/*.out

+/test/integration/cpp/[A-Za-z]*
+!/test/integration/cpp/[A-Za-z]*.*
+/test/integration/cpp/*.[od]
+/test/integration/cpp/*.out
+
 /test/src/*.[od]
-/test/src/*.gcda
-/test/src/*.gcno

 /test/stress/[A-Za-z]*
 !/test/stress/[A-Za-z]*.*
+!/test/stress/pa/
 /test/stress/*.[od]
-/test/stress/*.gcda
-/test/stress/*.gcno
 /test/stress/*.out

 /test/unit/[A-Za-z]*
 !/test/unit/[A-Za-z]*.*
 /test/unit/*.[od]
-/test/unit/*.gcda
-/test/unit/*.gcno
 /test/unit/*.out

+/test/analyze/[A-Za-z]*
+!/test/analyze/[A-Za-z]*.*
+/test/analyze/*.[od]
+/test/analyze/*.out
+
 /VERSION

 *.pdb
 *.sdf
 *.opendb
+*.VC.db
 *.opensdf
 *.cachefile
 *.suo
 *.user
 *.sln.docstates
 *.tmp
+.vs/
 /msvc/Win32/
 /msvc/x64/
 /msvc/projects/*/*/Debug*/
--- a/.travis.yml
+++ b/.travis.yml
@ -1,29 +1,365 @@
-language: c
+# This config file is generated by ./scripts/gen_travis.py.
+# Do not edit by hand.

-matrix:
+# We use 'minimal', because 'generic' makes Windows VMs hang at startup. Also
+# the software provided by 'generic' is simply not needed for our tests.
+# Differences are explained here:
+# https://docs.travis-ci.com/user/languages/minimal-and-generic/
+language: minimal
+dist: jammy
+
+jobs:
  include:
    - os: linux
-      compiler: gcc
+      arch: amd64
+      env: CC=gcc CXX=g++ EXTRA_CFLAGS="-Werror -Wno-array-bounds"
    - os: linux
-      compiler: gcc
-      env:
-        - EXTRA_FLAGS=-m32
-      addons:
-        apt:
-          packages:
-          - gcc-multilib
-    - os: osx
-      compiler: clang
-    - os: osx
-      compiler: clang
-      env:
-        - EXTRA_FLAGS=-m32
+      arch: amd64
+      env: CC=clang CXX=clang++ EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-opt-safety-checks" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=clang CXX=clang++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+    - os: linux
+      arch: amd64
+      env: CC=clang CXX=clang++ CONFIGURE_FLAGS="--enable-debug" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+    - os: linux
+      arch: amd64
+      env: CC=clang CXX=clang++ CONFIGURE_FLAGS="--enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+    - os: linux
+      arch: amd64
+      env: CC=clang CXX=clang++ CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+    - os: linux
+      arch: amd64
+      env: CC=clang CXX=clang++ CONFIGURE_FLAGS="--disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+    - os: linux
+      arch: amd64
+      env: CC=clang CXX=clang++ CONFIGURE_FLAGS="--enable-opt-safety-checks" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+    - os: linux
+      arch: amd64
+      env: CC=clang CXX=clang++ CONFIGURE_FLAGS="--with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+    - os: linux
+      arch: amd64
+      env: CC=clang CXX=clang++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+    - os: linux
+      arch: amd64
+      env: CC=clang CXX=clang++ CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+    - os: linux
+      arch: amd64
+      env: CC=clang CXX=clang++ CONFIGURE_FLAGS="--with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+    - os: linux
+      arch: amd64
+      env: CC=clang CXX=clang++ CONFIGURE_FLAGS="--with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+    - os: linux
+      arch: amd64
+      env: CC=clang CXX=clang++ CONFIGURE_FLAGS="--with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--enable-debug" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--enable-opt-safety-checks" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --enable-opt-safety-checks" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-opt-safety-checks" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats --disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats --enable-opt-safety-checks" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats --with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats --enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats --with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats --with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-libdl --enable-opt-safety-checks" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-libdl --with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-libdl --enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-libdl --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-libdl --with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-libdl --with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-libdl --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-opt-safety-checks --with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-opt-safety-checks --enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-opt-safety-checks --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-opt-safety-checks --with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-opt-safety-checks --with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-opt-safety-checks --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16 --enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16 --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16 --with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16 --with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16 --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr --with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr --with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=tcache:false,dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=tcache:false,percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=tcache:false,background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=dss:primary,percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=dss:primary,background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: amd64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=percpu_arena:percpu,background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: arm64
+      env: CC=gcc CXX=g++ EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: arm64
+      env: CC=clang CXX=clang++ EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
+    - os: linux
+      arch: arm64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: arm64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: arm64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: arm64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: arm64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-opt-safety-checks" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: arm64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: arm64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16 --with-lg-hugepage=29" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: arm64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: arm64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: arm64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: arm64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    - os: linux
+      arch: arm64
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    # Development build
+    - os: linux
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --disable-cache-oblivious --enable-stats --enable-log --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+    # --enable-expermental-smallocx:
+    - os: linux
+      env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --enable-experimental-smallocx --enable-stats --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
+
+
+before_install:
+  - |-
+    if test -f "./scripts/$TRAVIS_OS_NAME/before_install.sh"; then
+      source ./scripts/$TRAVIS_OS_NAME/before_install.sh
+    fi

 before_script:
-  - autoconf
-  - ./configure${EXTRA_FLAGS:+ CC="$CC $EXTRA_FLAGS"}
-  - make -j3
-  - make -j3 tests
+  - |-
+    if test -f "./scripts/$TRAVIS_OS_NAME/before_script.sh"; then
+      source ./scripts/$TRAVIS_OS_NAME/before_script.sh
+    else
+      scripts/gen_travis.py > travis_script && diff .travis.yml travis_script
+      autoconf
+      # If COMPILER_FLAGS are not empty, add them to CC and CXX
+      ./configure ${COMPILER_FLAGS:+ CC="$CC $COMPILER_FLAGS" CXX="$CXX $COMPILER_FLAGS"} $CONFIGURE_FLAGS
+      make -j3
+      make -j3 tests
+    fi

 script:
-  - make check
+  - |-
+    if test -f "./scripts/$TRAVIS_OS_NAME/script.sh"; then
+      source ./scripts/$TRAVIS_OS_NAME/script.sh
+    else
+      make check
+    fi
+
--- a/4
+++ b/4
@ -1,10 +1,10 @@
 Unless otherwise specified, files in the jemalloc source distribution are
 subject to the following license:
 --------------------------------------------------------------------------------
-Copyright (C) 2002-2016 Jason Evans <jasone@canonware.com>.
+Copyright (C) 2002-present Jason Evans <jasone@canonware.com>.
 All rights reserved.
 Copyright (C) 2007-2012 Mozilla Foundation.  All rights reserved.
-Copyright (C) 2009-2016 Facebook, Inc.  All rights reserved.
+Copyright (C) 2009-present Facebook, Inc.  All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are met:
--- a/787
+++ b/787
@ -4,6 +4,791 @@ brevity.  Much more detail can be found in the git revision history:

    https://github.com/jemalloc/jemalloc

+* 5.3.1 (Apr 13, 2026)
+
+This release includes over 390 commits spanning bug fixes, new features,
+performance optimizations, and portability improvements.  Multiple percent
+of system-level metric improvements were measured in tested production
+workloads.  The release has gone through large-scale production testing
+at Meta.
+
+New features:
+  - Support pvalloc.  (@Lapenkov: 5b1f2cc5)
+  - Add double free detection for the debug build.  (@izaitsevfb:
+    36366f3c, @guangli-dai: 42daa1ac, @divanorama: 1897f185)
+  - Add compile-time option `--enable-pageid` to enable memory mapping
+    annotation.  (@devnexen: 4fc5c4fb)
+  - Add runtime option `prof_bt_max` to control the max stack depth for
+    profiling.  (@guangli-dai: a0734fd6)
+  - Add compile-time option `--enable-force-getenv` to use `getenv` instead
+    of `secure_getenv`.  (@interwq: 481bbfc9)
+  - Add compile-time option `--disable-dss` to disable the usage of
+    `sbrk(2)`.  (@Svetlitski: ea5b7bea)
+  - Add runtime option `tcache_ncached_max` to control the number of items
+    in each size bin in the thread cache.  (@guangli-dai: 8a22d10b)
+  - Add runtime option `calloc_madvise_threshold` to determine if kernel or
+    memset is used to zero the allocations for calloc.  (@nullptr0-0:
+    5081c16b)
+  - Add compile-time option `--disable-user-config` to disable reading the
+    runtime configurations from `/etc/malloc.conf` or environment variable
+    `MALLOC_CONF`.  (@roblabla: c17bf8b3)
+  - Add runtime option `disable_large_size_classes` to guard the new usable
+    size calculation, which minimizes the memory overhead for large
+    allocations, i.e., >= 4 * PAGE.  (@guangli-dai: c067a55c, 8347f104)
+  - Enable process_madvise usage, and add runtime option
+    `process_madvise_max_batch` to control the maximum number of regions in
+    each madvise batch.  (@interwq: 22440a02, @spredolac: 4246475b)
+  - Add mallctl interfaces:
+    + `opt.prof_bt_max`  (@guangli-dai: a0734fd6)
+    + `arena.<i>.name` to set and get arena names.  (@guangli-dai: ba19d2cb)
+    + `thread.tcache.max` to set and get the `tcache_max` of the current
+      thread.  (@guangli-dai: a442d9b8)
+    + `thread.tcache.ncached_max.write` and
+      `thread.tcache.ncached_max.read_sizeclass` to set and get the
+      `ncached_max` setup of the current thread.  (@guangli-dai: 630f7de9,
+      6b197fdd)
+    + `arenas.hugepage` to return the hugepage size used, also exported to
+      malloc stats.  (@ilvokhin: 90c627ed)
+    + `approximate_stats.active` to return an estimate of the current active
+      bytes, which should not be compared with other stats retrieved.
+      (@guangli-dai: 0988583d)
+
+Bug fixes:
+  - Prevent potential deadlocks in decaying during reentrancy.  (@interwq:
+    434a68e2)
+  - Fix segfault in extent coalescing.  (@Svetlitski: 12311fe6)
+  - Add null pointer detections in mallctl calls.  (@Svetlitski: dc0a184f,
+    0288126d)
+  - Make mallctl `arenas.lookup` triable without crashing on invalid
+    pointers.  (@auxten: 019cccc2, 5bac3849)
+  - Demote sampled allocations for proper deallocations during
+    `arena_reset`.  (@Svetlitski: 62648c88)
+  - Fix jemalloc's `read(2)` and `write(2)`.  (@Svetlitski: d2c9ed3d, @lexprfuncall:
+    9fdc1160)
+  - Fix the pkg-config metadata file.  (@BtbN: ed7e6fe7, ce8ce99a)
+  - Fix the autogen.sh so that it accepts quoted extra options.
+    (@honggyukim: f6fe6abd)
+  - Fix `rallocx()` to set errno to ENOMEM upon OOMing.  (@arter97: 38056fea,
+    @interwq: 83b07578)
+  - Avoid stack overflow for internal variable array usage.  (@nullptr0-0:
+    47c9bcd4, 48f66cf4, @xinydev: 9169e927)
+  - Fix background thread initialization race.  (@puzpuzpuz: 4d0ffa07)
+  - Guard os_page_id against a NULL address.  (@lexprfuncall: 79cc7dcc)
+  - Handle tcache init failures gracefully.  (@lexprfuncall: a056c20d)
+  - Fix missing release of acquired neighbor edata in
+    extent_try_coalesce_impl.  (@spredolac: 675ab079)
+  - Fix memory leak of old curr_reg on san_bump_grow_locked failure.
+    (@spredolac: 5904a421)
+  - Fix large alloc nrequests under-counting on cache misses.  (@spredolac:
+    3cc56d32)
+
+Portability improvements:
+  - Fix the build in C99.  (@abaelhe: 56ddbea2)
+  - Add `pthread_setaffinity_np` detection for non Linux/BSD platforms.
+    (@devnexen: 4c95c953)
+  - Make `VARIABLE_ARRAY` compatible with compilers not supporting VLA,
+    i.e., Visual Studio C compiler in C11 or C17 modes.  (@madscientist:
+    be65438f)
+  - Fix the build on Linux using musl library.  (@marv: aba1645f, 45249cf5)
+  - Reduce the memory overhead in small allocation sampling for systems
+    with larger page sizes, e.g., ARM.  (@Svetlitski: 5a858c64)
+  - Add C23's `free_sized` and `free_aligned_sized`.  (@Svetlitski:
+    cdb2c0e0)
+  - Enable heap profiling on macOS.  (@nullptr0-0: 4b555c11)
+  - Fix incorrect printing on 32-bit.  (@sundb: 630434bb)
+  - Make `JEMALLOC_CXX_THROW` compatible with C++ versions newer than
+    C++17.  (@r-barnes, @guangli-dai: 21bcc0a8)
+  - Fix mmap tag conflicts on macOS.  (@kdrag0n: c893fcd1)
+  - Fix monotonic timer assumption for win32.  (@burtonli: 8dc97b11)
+  - Fix VM over-reservation on systems with larger pages, e.g., aarch64.
+    (@interwq: cd05b19f)
+  - Remove `unreachable()` macro conditionally to prevent definition
+    conflicts for C23+.  (@appujee: d8486b26, 4b88bddb)
+  - Fix dlsym failure observed on FreeBSD.  (@rhelmot: 86bbabac)
+  - Change the default page size to 64KB on aarch64 Linux.  (@lexprfuncall:
+    9442300c)
+  - Update config.guess and config.sub to the latest version.
+    (@lexprfuncall: c51949ea)
+  - Determine the page size on Android from NDK header files.
+    (@lexprfuncall: c51abba1)
+  - Improve the portability of grep patterns in configure.ac.
+    (@lexprfuncall: 365747bc)
+  - Add compile-time option `--with-cxx-stdlib` to specify the C++ standard
+    library.  (@yuxuanchen1997: a10ef3e1)
+
+Optimizations and refactors:
+  - Enable tcache for deallocation-only threads.  (@interwq: 143e9c4a)
+  - Inline to accelerate operator delete.  (@guangli-dai: e8f9f138)
+  - Optimize pairing heap's performance.  (@deadalnix: 5266152d, be6da4f6,
+    543e2d61, 10d71315, 92aa52c0, @Svetlitski: 36ca0c1b)
+  - Inline the storage for thread name in the profiling data.  (@interwq:
+    ce0b7ab6, e62aa478)
+  - Optimize a hot function `edata_cmp_summary_comp` to accelerate it.
+    (@Svetlitski: 6841110b, @guangli-dai: 0181aaa4)
+  - Allocate thread cache using the base allocator, which enables thread
+    cache to use thp when `metadata_thp` is turned on.  (@interwq:
+    72cfdce7)
+  - Allow oversize arena not to purge immediately when background threads
+    are enabled, although the default decay time is 0 for backward
+    compatibility.  (@interwq: d1313313)
+  - Optimize thread-local storage implementation on Windows.  (@mcfi:
+    9e123a83, 3a0d9cda)
+  - Optimize fast path to allow static size class computation.  (@interwq:
+    323ed2e3)
+  - Redesign tcache GC to regulate the frequency and make it
+    locality-aware.  The new design is on by default, guarded by option
+    `experimental_tcache_gc`.  (@nullptr0-0: 0c88be9e, e2c9f3a9,
+    14d5dc13, @deadalnix: 5afff2e4)
+  - Reduce the arena switching overhead by avoiding forced purging when
+    background thread is enabled.  (@interwq: a3910b98)
+  - Improve the reuse efficiency by limiting the maximum coalesced size for
+    large extents.  (@jiebinn: 3c14707b)
+  - Refactor thread events to allow registration of users' thread events
+    and remove prof_threshold as the built-in event.  (@spredolac: e6864c60,
+    015b0179, 34ace916)
+
+Documentation:
+  - Update Windows building instructions.  (@Lapenkov: 37139328)
+  - Add vcpkg installation instructions.  (@LilyWangLL: c0c9783e)
+  - Update profiling internals with an example.  (@jordalgo: b04e7666)
+
+* 5.3.0 (May 6, 2022)
+
+  This release contains many speed and space optimizations, from micro
+  optimizations on common paths to rework of internal data structures and
+  locking schemes, and many more too detailed to list below.  Multiple percent
+  of system level metric improvements were measured in tested production
+  workloads.  The release has gone through large-scale production testing.
+
+  New features:
+  - Add the thread.idle mallctl which hints that the calling thread will be
+    idle for a nontrivial period of time.  (@davidtgoldblatt)
+  - Allow small size classes to be the maximum size class to cache in the
+    thread-specific cache, through the opt.[lg_]tcache_max option.  (@interwq,
+    @jordalgo)
+  - Make the behavior of realloc(ptr, 0) configurable with opt.zero_realloc.
+    (@davidtgoldblatt)
+  - Add 'make uninstall' support.  (@sangshuduo, @Lapenkov)
+  - Support C++17 over-aligned allocation.  (@marksantaniello)
+  - Add the thread.peak mallctl for approximate per-thread peak memory tracking.
+    (@davidtgoldblatt)
+  - Add interval-based stats output opt.stats_interval.  (@interwq)
+  - Add prof.prefix to override filename prefixes for dumps.  (@zhxchen17)
+  - Add high resolution timestamp support for profiling.  (@tyroguru)
+  - Add the --collapsed flag to jeprof for flamegraph generation.
+    (@igorwwwwwwwwwwwwwwwwwwww)
+  - Add the --debug-syms-by-id option to jeprof for debug symbols discovery.
+    (@DeannaGelbart)
+  - Add the opt.prof_leak_error option to exit with error code when leak is
+    detected using opt.prof_final.  (@yunxuo)
+  - Add opt.cache_oblivious as a runtime alternative to config.cache_oblivious.
+    (@interwq)
+  - Add mallctl interfaces:
+    + opt.zero_realloc  (@davidtgoldblatt)
+    + opt.cache_oblivious  (@interwq)
+    + opt.prof_leak_error  (@yunxuo)
+    + opt.stats_interval  (@interwq)
+    + opt.stats_interval_opts  (@interwq)
+    + opt.tcache_max  (@interwq)
+    + opt.trust_madvise  (@azat)
+    + prof.prefix  (@zhxchen17)
+    + stats.zero_reallocs  (@davidtgoldblatt)
+    + thread.idle  (@davidtgoldblatt)
+    + thread.peak.{read,reset}  (@davidtgoldblatt)
+
+  Bug fixes:
+  - Fix the synchronization around explicit tcache creation which could cause
+    invalid tcache identifiers.  This regression was first released in 5.0.0.
+    (@yoshinorim, @davidtgoldblatt)
+  - Fix a profiling biasing issue which could cause incorrect heap usage and
+    object counts.  This issue existed in all previous releases with the heap
+    profiling feature.  (@davidtgoldblatt)
+  - Fix the order of stats counter updating on large realloc which could cause
+    failed assertions.  This regression was first released in 5.0.0.  (@azat)
+  - Fix the locking on the arena destroy mallctl, which could cause concurrent
+    arena creations to fail.  This functionality was first introduced in 5.0.0.
+    (@interwq)
+
+  Portability improvements:
+  - Remove nothrow from system function declarations on macOS and FreeBSD.
+    (@davidtgoldblatt, @fredemmott, @leres)
+  - Improve overcommit and page alignment settings on NetBSD.  (@zoulasc)
+  - Improve CPU affinity support on BSD platforms.  (@devnexen)
+  - Improve utrace detection and support.  (@devnexen)
+  - Improve QEMU support with MADV_DONTNEED zeroed pages detection.  (@azat)
+  - Add memcntl support on Solaris / illumos.  (@devnexen)
+  - Improve CPU_SPINWAIT on ARM.  (@AWSjswinney)
+  - Improve TSD cleanup on FreeBSD.  (@Lapenkov)
+  - Disable percpu_arena if the CPU count cannot be reliably detected.  (@azat)
+  - Add malloc_size(3) override support.  (@devnexen)
+  - Add mmap VM_MAKE_TAG support.  (@devnexen)
+  - Add support for MADV_[NO]CORE.  (@devnexen)
+  - Add support for DragonFlyBSD.  (@devnexen)
+  - Fix the QUANTUM setting on MIPS64.  (@brooksdavis)
+  - Add the QUANTUM setting for ARC.  (@vineetgarc)
+  - Add the QUANTUM setting for LoongArch.  (@wangjl-uos)
+  - Add QNX support.  (@jqian-aurora)
+  - Avoid atexit(3) calls unless the relevant profiling features are enabled.
+    (@BusyJay, @laiwei-rice, @interwq)
+  - Fix unknown option detection when using Clang.  (@Lapenkov)
+  - Fix symbol conflict with musl libc.  (@georgthegreat)
+  - Add -Wimplicit-fallthrough checks.  (@nickdesaulniers)
+  - Add __forceinline support on MSVC.  (@santagada)
+  - Improve FreeBSD and Windows CI support.  (@Lapenkov)
+  - Add CI support for PPC64LE architecture.  (@ezeeyahoo)
+
+  Incompatible changes:
+  - Maximum size class allowed in tcache (opt.[lg_]tcache_max) now has an upper
+    bound of 8MiB.  (@interwq)
+
+  Optimizations and refactors (@davidtgoldblatt, @Lapenkov, @interwq):
+  - Optimize the common cases of the thread cache operations.
+  - Optimize internal data structures, including RB tree and pairing heap.
+  - Optimize the internal locking on extent management.
+  - Extract and refactor the internal page allocator and interface modules.
+
+  Documentation:
+  - Fix doc build with --with-install-suffix.  (@lawmurray, @interwq)
+  - Add PROFILING_INTERNALS.md.  (@davidtgoldblatt)
+  - Ensure the proper order of doc building and installation.  (@Mingli-Yu)
+
+* 5.2.1 (August 5, 2019)
+
+  This release is primarily about Windows.  A critical virtual memory leak is
+  resolved on all Windows platforms.  The regression was present in all releases
+  since 5.0.0.
+
+  Bug fixes:
+  - Fix a severe virtual memory leak on Windows.  This regression was first
+    released in 5.0.0.  (@Ignition, @j0t, @frederik-h, @davidtgoldblatt,
+    @interwq)
+  - Fix size 0 handling in posix_memalign().  This regression was first released
+    in 5.2.0.  (@interwq)
+  - Fix the prof_log unit test which may observe unexpected backtraces from
+    compiler optimizations.  The test was first added in 5.2.0.  (@marxin,
+    @gnzlbg, @interwq)
+  - Fix the declaration of the extent_avail tree.  This regression was first
+    released in 5.1.0.  (@zoulasc)
+  - Fix an incorrect reference in jeprof.  This functionality was first released
+    in 3.0.0.  (@prehistoric-penguin)
+  - Fix an assertion on the deallocation fast-path.  This regression was first
+    released in 5.2.0.  (@yinan1048576)
+  - Fix the TLS_MODEL attribute in headers.  This regression was first released
+    in 5.0.0.  (@zoulasc, @interwq)
+
+  Optimizations and refactors:
+  - Implement opt.retain on Windows and enable by default on 64-bit.  (@interwq,
+    @davidtgoldblatt)
+  - Optimize away a branch on the operator delete[] path.  (@mgrice)
+  - Add format annotation to the format generator function.  (@zoulasc)
+  - Refactor and improve the size class header generation.  (@yinan1048576)
+  - Remove best fit.  (@djwatson)
+  - Avoid blocking on background thread locks for stats.  (@oranagra, @interwq)
+
+* 5.2.0 (April 2, 2019)
+
+  This release includes a few notable improvements, which are summarized below:
+  1) improved fast-path performance from the optimizations by @djwatson; 2)
+  reduced virtual memory fragmentation and metadata usage; and 3) bug fixes on
+  setting the number of background threads.  In addition, peak / spike memory
+  usage is improved with certain allocation patterns.  As usual, the release and
+  prior dev versions have gone through large-scale production testing.
+
+  New features:
+  - Implement oversize_threshold, which uses a dedicated arena for allocations
+    crossing the specified threshold to reduce fragmentation.  (@interwq)
+  - Add extents usage information to stats.  (@tyleretzel)
+  - Log time information for sampled allocations.  (@tyleretzel)
+  - Support 0 size in sdallocx.  (@djwatson)
+  - Output rate for certain counters in malloc_stats.  (@zinoale)
+  - Add configure option --enable-readlinkat, which allows the use of readlinkat
+    over readlink.  (@davidtgoldblatt)
+  - Add configure options --{enable,disable}-{static,shared} to allow not
+    building unwanted libraries.  (@Ericson2314)
+  - Add configure option --disable-libdl to enable fully static builds.
+    (@interwq)
+  - Add mallctl interfaces:
+    + opt.oversize_threshold  (@interwq)
+    + stats.arenas.<i>.extent_avail  (@tyleretzel)
+    + stats.arenas.<i>.extents.<j>.n{dirty,muzzy,retained}  (@tyleretzel)
+    + stats.arenas.<i>.extents.<j>.{dirty,muzzy,retained}_bytes
+      (@tyleretzel)
+
+  Portability improvements:
+  - Update MSVC builds.  (@maksqwe, @rustyx)
+  - Workaround a compiler optimizer bug on s390x.  (@rkmisra)
+  - Make use of pthread_set_name_np(3) on FreeBSD.  (@trasz)
+  - Implement malloc_getcpu() to enable percpu_arena for Windows.  (@santagada)
+  - Link against -pthread instead of -lpthread.  (@paravoid)
+  - Make background_thread not dependent on libdl.  (@interwq)
+  - Add stringify to fix a linker directive issue on MSVC.  (@daverigby)
+  - Detect and fall back when 8-bit atomics are unavailable.  (@interwq)
+  - Fall back to the default pthread_create if dlsym(3) fails.  (@interwq)
+
+  Optimizations and refactors:
+  - Refactor the TSD module.  (@davidtgoldblatt)
+  - Avoid taking extents_muzzy mutex when muzzy is disabled.  (@interwq)
+  - Avoid taking large_mtx for auto arenas on the tcache flush path.  (@interwq)
+  - Optimize ixalloc by avoiding a size lookup.  (@interwq)
+  - Implement opt.oversize_threshold which uses a dedicated arena for requests
+    crossing the threshold, also eagerly purges the oversize extents.  Default
+    the threshold to 8 MiB.  (@interwq)
+  - Clean compilation with -Wextra.  (@gnzlbg, @jasone)
+  - Refactor the size class module.  (@davidtgoldblatt)
+  - Refactor the stats emitter.  (@tyleretzel)
+  - Optimize pow2_ceil.  (@rkmisra)
+  - Avoid runtime detection of lazy purging on FreeBSD.  (@trasz)
+  - Optimize mmap(2) alignment handling on FreeBSD.  (@trasz)
+  - Improve error handling for THP state initialization.  (@jsteemann)
+  - Rework the malloc() fast path.  (@djwatson)
+  - Rework the free() fast path.  (@djwatson)
+  - Refactor and optimize the tcache fill / flush paths.  (@djwatson)
+  - Optimize sync / lwsync on PowerPC.  (@chmeeedalf)
+  - Bypass extent_dalloc() when retain is enabled.  (@interwq)
+  - Optimize the locking on large deallocation.  (@interwq)
+  - Reduce the number of pages committed from sanity checking in debug build.
+    (@trasz, @interwq)
+  - Deprecate OSSpinLock.  (@interwq)
+  - Lower the default number of background threads to 4 (when the feature
+    is enabled).  (@interwq)
+  - Optimize the trylock spin wait.  (@djwatson)
+  - Use arena index for arena-matching checks.  (@interwq)
+  - Avoid forced decay on thread termination when using background threads.
+    (@interwq)
+  - Disable muzzy decay by default.  (@djwatson, @interwq)
+  - Only initialize libgcc unwinder when profiling is enabled.  (@paravoid,
+    @interwq)
+
+  Bug fixes (all only relevant to jemalloc 5.x):
+  - Fix background thread index issues with max_background_threads.  (@djwatson,
+    @interwq)
+  - Fix stats output for opt.lg_extent_max_active_fit.  (@interwq)
+  - Fix opt.prof_prefix initialization.  (@davidtgoldblatt)
+  - Properly trigger decay on tcache destroy.  (@interwq, @amosbird)
+  - Fix tcache.flush.  (@interwq)
+  - Detect whether explicit extent zero out is necessary with huge pages or
+    custom extent hooks, which may change the purge semantics.  (@interwq)
+  - Fix a side effect caused by extent_max_active_fit combined with decay-based
+    purging, where freed extents can accumulate and not be reused for an
+    extended period of time.  (@interwq, @mpghf)
+  - Fix a missing unlock on extent register error handling.  (@zoulasc)
+
+  Testing:
+  - Simplify the Travis script output.  (@gnzlbg)
+  - Update the test scripts for FreeBSD.  (@devnexen)
+  - Add unit tests for the producer-consumer pattern.  (@interwq)
+  - Add Cirrus-CI config for FreeBSD builds.  (@jasone)
+  - Add size-matching sanity checks on tcache flush.  (@davidtgoldblatt,
+    @interwq)
+
+  Incompatible changes:
+  - Remove --with-lg-page-sizes.  (@davidtgoldblatt)
+
+  Documentation:
+  - Attempt to build docs by default; however, skip doc building when xsltproc
+    is missing.  (@interwq, @cmuellner)
+
+* 5.1.0 (May 4, 2018)
+
+  This release is primarily about fine-tuning, ranging from several new features
+  to numerous notable performance and portability enhancements.  The release and
+  prior dev versions have been running in multiple large scale applications for
+  months, and the cumulative improvements are substantial in many cases.
+
+  Given the long and successful production runs, this release is likely a good
+  candidate for applications to upgrade, from both jemalloc 5.0 and before.  For
+  performance-critical applications, the newly added TUNING.md provides
+  guidelines on jemalloc tuning.
+
+  New features:
+  - Implement transparent huge page support for internal metadata.  (@interwq)
+  - Add opt.thp to allow enabling / disabling transparent huge pages for all
+    mappings.  (@interwq)
+  - Add maximum background thread count option.  (@djwatson)
+  - Allow prof_active to control opt.lg_prof_interval and prof.gdump.
+    (@interwq)
+  - Allow arena index lookup based on allocation addresses via mallctl.
+    (@lionkov)
+  - Allow disabling initial-exec TLS model.  (@davidtgoldblatt, @KenMacD)
+  - Add opt.lg_extent_max_active_fit to set the max ratio between the size of
+    the active extent selected (to split off from) and the size of the requested
+    allocation.  (@interwq, @davidtgoldblatt)
+  - Add retain_grow_limit to set the max size when growing virtual address
+    space.  (@interwq)
+  - Add mallctl interfaces:
+    + arena.<i>.retain_grow_limit  (@interwq)
+    + arenas.lookup  (@lionkov)
+    + max_background_threads  (@djwatson)
+    + opt.lg_extent_max_active_fit  (@interwq)
+    + opt.max_background_threads  (@djwatson)
+    + opt.metadata_thp  (@interwq)
+    + opt.thp  (@interwq)
+    + stats.metadata_thp  (@interwq)
+
+  Portability improvements:
+  - Support GNU/kFreeBSD configuration.  (@paravoid)
+  - Support m68k, nios2 and SH3 architectures.  (@paravoid)
+  - Fall back to FD_CLOEXEC when O_CLOEXEC is unavailable.  (@zonyitoo)
+  - Fix symbol listing for cross-compiling.  (@tamird)
+  - Fix high bits computation on ARM.  (@davidtgoldblatt, @paravoid)
+  - Disable the CPU_SPINWAIT macro for Power.  (@davidtgoldblatt, @marxin)
+  - Fix MSVC 2015 & 2017 builds.  (@rustyx)
+  - Improve RISC-V support.  (@EdSchouten)
+  - Set name mangling script in strict mode.  (@nicolov)
+  - Avoid MADV_HUGEPAGE on ARM.  (@marxin)
+  - Modify configure to determine return value of strerror_r.
+    (@davidtgoldblatt, @cferris1000)
+  - Make sure CXXFLAGS is tested with CPP compiler.  (@nehaljwani)
+  - Fix 32-bit build on MSVC.  (@rustyx)
+  - Fix external symbol on MSVC.  (@maksqwe)
+  - Avoid a printf format specifier warning.  (@jasone)
+  - Add configure option --disable-initial-exec-tls which can allow jemalloc to
+    be dynamically loaded after program startup.  (@davidtgoldblatt, @KenMacD)
+  - AArch64: Add ILP32 support.  (@cmuellner)
+  - Add --with-lg-vaddr configure option to support cross compiling.
+    (@cmuellner, @davidtgoldblatt)
+
+  Optimizations and refactors:
+  - Improve active extent fit with extent_max_active_fit.  This considerably
+    reduces fragmentation over time and improves virtual memory and metadata
+    usage.  (@davidtgoldblatt, @interwq)
+  - Eagerly coalesce large extents to reduce fragmentation.  (@interwq)
+  - sdallocx: only read size info when page aligned (i.e. possibly sampled),
+    which speeds up the sized deallocation path significantly.  (@interwq)
+  - Avoid attempting new mappings for in place expansion with retain, since
+    it rarely succeeds in practice and causes high overhead.  (@interwq)
+  - Refactor OOM handling in newImpl.  (@wqfish)
+  - Add internal fine-grained logging functionality for debugging use.
+    (@davidtgoldblatt)
+  - Refactor arena / tcache interactions.  (@davidtgoldblatt)
+  - Refactor extent management with dumpable flag.  (@davidtgoldblatt)
+  - Add runtime detection of lazy purging.  (@interwq)
+  - Use pairing heap instead of red-black tree for extents_avail.  (@djwatson)
+  - Use sysctl on startup in FreeBSD.  (@trasz)
+  - Use thread local prng state instead of atomic.  (@djwatson)
+  - Make decay always purge one more extent than before, because in
+    practice large extents are usually the ones that cross the decay threshold.
+    Purging the additional extent helps save memory as well as reduce VM
+    fragmentation.  (@interwq)
+  - Fast division by dynamic values.  (@davidtgoldblatt)
+  - Improve the fit for aligned allocation.  (@interwq, @edwinsmith)
+  - Refactor extent_t bitpacking.  (@rkmisra)
+  - Optimize the generated assembly for ticker operations.  (@davidtgoldblatt)
+  - Convert stats printing to use a structured text emitter.  (@davidtgoldblatt)
+  - Remove preserve_lru feature for extents management.  (@djwatson)
+  - Consolidate two memory loads into one on the fast deallocation path.
+    (@davidtgoldblatt, @interwq)
+
+  Bug fixes (most of the issues are only relevant to jemalloc 5.0):
+  - Fix deadlock with multithreaded fork in OS X.  (@davidtgoldblatt)
+  - Validate returned file descriptor before use.  (@zonyitoo)
+  - Fix a few background thread initialization and shutdown issues.  (@interwq)
+  - Fix an extent coalesce + decay race by taking both coalescing extents off
+    the LRU list.  (@interwq)
+  - Fix a potentially unbounded increase during decay, caused by one thread
+    continually stashing memory to purge while other threads generate new
+    pages.  The number of pages to purge is checked to prevent this.  (@interwq)
+  - Fix a FreeBSD bootstrap assertion.  (@strejda, @interwq)
+  - Handle 32-bit mutex counters.  (@rkmisra)
+  - Fix an indexing bug when creating background threads.
+    (@davidtgoldblatt, @binliu19)
+  - Fix arguments passed to extent_init.  (@yuleniwo, @interwq)
+  - Fix addresses used for ordering mutexes.  (@rkmisra)
+  - Fix abort_conf processing during bootstrap.  (@interwq)
+  - Fix include path order for out-of-tree builds.  (@cmuellner)
+
+  Incompatible changes:
+  - Remove --disable-thp.  (@interwq)
+  - Remove mallctl interfaces:
+    + config.thp  (@interwq)
+
+  Documentation:
+  - Add TUNING.md.  (@interwq, @davidtgoldblatt, @djwatson)
+
+* 5.0.1 (July 1, 2017)
+
+  This bugfix release fixes several issues, most of which are obscure enough
+  that typical applications are not impacted.
+
+  Bug fixes:
+  - Update decay->nunpurged before purging, in order to avoid potential update
+    races and subsequent incorrect purging volume.  (@interwq)
+  - Only abort on dlsym(3) error if the failure impacts an enabled feature (lazy
+    locking and/or background threads).  This mitigates an initialization
+    failure bug for which we still do not have a clear reproduction test case.
+    (@interwq)
+  - Modify tsd management so that it neither crashes nor leaks if a thread's
+    only allocation activity is to call free() after TLS destructors have been
+    executed.  This behavior was observed when operating with GNU libc, and is
+    unlikely to be an issue with other libc implementations.  (@interwq)
+  - Mask signals during background thread creation.  This prevents signals from
+    being inadvertently delivered to background threads.  (@jasone,
+    @davidtgoldblatt, @interwq)
+  - Avoid inactivity checks within background threads, in order to prevent
+    recursive mutex acquisition.  (@interwq)
+  - Fix extent_grow_retained() to use the specified hooks when the
+    arena.<i>.extent_hooks mallctl is used to override the default hooks.
+    (@interwq)
+  - Add missing reentrancy support for custom extent hooks which allocate.
+    (@interwq)
+  - Post-fork(2), re-initialize the list of tcaches associated with each arena
+    to contain no tcaches except the forking thread's.  (@interwq)
+  - Add missing post-fork(2) mutex reinitialization for extent_grow_mtx.  This
+    fixes potential deadlocks after fork(2).  (@interwq)
+  - Enforce minimum autoconf version (currently 2.68), since 2.63 is known to
+    generate corrupt configure scripts.  (@jasone)
+  - Ensure that the configured page size (--with-lg-page) is no larger than the
+    configured huge page size (--with-lg-hugepage).  (@jasone)
+
+* 5.0.0 (June 13, 2017)
+
+  Unlike all previous jemalloc releases, this release does not use naturally
+  aligned "chunks" for virtual memory management, and instead uses page-aligned
+  "extents".  This change has few externally visible effects, but the internal
+  impacts are... extensive.  Many other internal changes combine to make this
+  the most cohesively designed version of jemalloc so far, with ample
+  opportunity for further enhancements.
+
+  Continuous integration is now an integral aspect of development thanks to the
+  efforts of @davidtgoldblatt, and the dev branch tends to remain reasonably
+  stable on the tested platforms (Linux, FreeBSD, macOS, and Windows).  As a
+  side effect the official release frequency may decrease over time.
+
+  New features:
+  - Implement optional per-CPU arena support; threads choose which arena to use
+    based on current CPU rather than on fixed thread-->arena associations.
+    (@interwq)
+  - Implement two-phase decay of unused dirty pages.  Pages transition from
+    dirty-->muzzy-->clean, where the first phase transition relies on
+    madvise(... MADV_FREE) semantics, and the second phase transition discards
+    pages such that they are replaced with demand-zeroed pages on next access.
+    (@jasone)
+  - Increase decay time resolution from seconds to milliseconds.  (@jasone)
+  - Implement opt-in per CPU background threads, and use them for asynchronous
+    decay-driven unused dirty page purging.  (@interwq)
+  - Add mutex profiling, which collects a variety of statistics useful for
+    diagnosing overhead/contention issues.  (@interwq)
+  - Add C++ new/delete operator bindings.  (@djwatson)
+  - Support manually created arena destruction, such that all data and metadata
+    are discarded.  Add MALLCTL_ARENAS_DESTROYED for accessing merged stats
+    associated with destroyed arenas.  (@jasone)
+  - Add MALLCTL_ARENAS_ALL as a fixed index for use in accessing
+    merged/destroyed arena statistics via mallctl.  (@jasone)
+  - Add opt.abort_conf to optionally abort if invalid configuration options are
+    detected during initialization.  (@interwq)
+  - Add opt.stats_print_opts, so that e.g. JSON output can be selected for the
+    stats dumped during exit if opt.stats_print is true.  (@jasone)
+  - Add --with-version=VERSION for use when embedding jemalloc into another
+    project's git repository.  (@jasone)
+  - Add --disable-thp to support cross compiling.  (@jasone)
+  - Add --with-lg-hugepage to support cross compiling.  (@jasone)
+  - Add mallctl interfaces (various authors):
+    + background_thread
+    + opt.abort_conf
+    + opt.retain
+    + opt.percpu_arena
+    + opt.background_thread
+    + opt.{dirty,muzzy}_decay_ms
+    + opt.stats_print_opts
+    + arena.<i>.initialized
+    + arena.<i>.destroy
+    + arena.<i>.{dirty,muzzy}_decay_ms
+    + arena.<i>.extent_hooks
+    + arenas.{dirty,muzzy}_decay_ms
+    + arenas.bin.<i>.slab_size
+    + arenas.nlextents
+    + arenas.lextent.<i>.size
+    + arenas.create
+    + stats.background_thread.{num_threads,num_runs,run_interval}
+    + stats.mutexes.{ctl,background_thread,prof,reset}.
+      {num_ops,num_spin_acq,num_wait,max_wait_time,total_wait_time,max_num_thds,
+      num_owner_switch}
+    + stats.arenas.<i>.{dirty,muzzy}_decay_ms
+    + stats.arenas.<i>.uptime
+    + stats.arenas.<i>.{pmuzzy,base,internal,resident}
+    + stats.arenas.<i>.{dirty,muzzy}_{npurge,nmadvise,purged}
+    + stats.arenas.<i>.bins.<j>.{nslabs,reslabs,curslabs}
+    + stats.arenas.<i>.bins.<j>.mutex.
+      {num_ops,num_spin_acq,num_wait,max_wait_time,total_wait_time,max_num_thds,
+      num_owner_switch}
+    + stats.arenas.<i>.lextents.<j>.{nmalloc,ndalloc,nrequests,curlextents}
+    + stats.arenas.<i>.mutexes.{large,extent_avail,extents_dirty,extents_muzzy,
+      extents_retained,decay_dirty,decay_muzzy,base,tcache_list}.
+      {num_ops,num_spin_acq,num_wait,max_wait_time,total_wait_time,max_num_thds,
+      num_owner_switch}
+
+  Portability improvements:
+  - Improve reentrant allocation support, such that deadlock is less likely if
+    e.g. a system library call in turn allocates memory.  (@davidtgoldblatt,
+    @interwq)
+  - Support static linking of jemalloc with glibc.  (@djwatson)
+
+  Optimizations and refactors:
+  - Organize virtual memory as "extents" of virtual memory pages, rather than as
+    naturally aligned "chunks", and store all metadata in arbitrarily distant
+    locations.  This reduces virtual memory external fragmentation, and will
+    interact better with huge pages (not yet explicitly supported).  (@jasone)
+  - Fold large and huge size classes together; only small and large size classes
+    remain.  (@jasone)
+  - Unify the allocation paths, and merge most fast-path branching decisions.
+    (@davidtgoldblatt, @interwq)
+  - Embed per thread automatic tcache into thread-specific data, which reduces
+    conditional branches and dereferences.  Also reorganize tcache to increase
+    fast-path data locality.  (@interwq)
+  - Rewrite atomics to closely model the C11 API, convert various
+    synchronization from mutex-based to atomic, and use the explicit memory
+    ordering control to resolve various hypothetical races without increasing
+    synchronization overhead.  (@davidtgoldblatt)
+  - Extensively optimize rtree via various methods:
+    + Add multiple layers of rtree lookup caching, since rtree lookups are now
+      part of fast-path deallocation.  (@interwq)
+    + Determine rtree layout at compile time.  (@jasone)
+    + Make the tree shallower for common configurations.  (@jasone)
+    + Embed the root node in the top-level rtree data structure, thus avoiding
+      one level of indirection.  (@jasone)
+    + Further specialize leaf elements as compared to internal node elements,
+      and directly embed extent metadata needed for fast-path deallocation.
+      (@jasone)
+    + Ignore leading always-zero address bits (architecture-specific).
+      (@jasone)
+  - Reorganize headers (ongoing work) to make them hermetic, and disentangle
+    various module dependencies.  (@davidtgoldblatt)
+  - Convert various internal data structures such as size class metadata from
+    boot-time-initialized to compile-time-initialized.  Propagate resulting data
+    structure simplifications, such as making arena metadata fixed-size.
+    (@jasone)
+  - Simplify size class lookups when constrained to size classes that are
+    multiples of the page size.  This speeds lookups, but the primary benefit is
+    complexity reduction in code that was the source of numerous regressions.
+    (@jasone)
+  - Lock individual extents when possible for localized extent operations,
+    rather than relying on a top-level arena lock.  (@davidtgoldblatt, @jasone)
+  - Use first fit layout policy instead of best fit, in order to improve
+    packing.  (@jasone)
+  - If munmap(2) is not in use, use an exponential series to grow each arena's
+    virtual memory, so that the number of disjoint virtual memory mappings
+    remains low.  (@jasone)
+  - Implement per arena base allocators, so that arenas never share any virtual
+    memory pages.  (@jasone)
+  - Automatically generate private symbol name mangling macros.  (@jasone)
+
+  Incompatible changes:
+  - Replace chunk hooks with an expanded/normalized set of extent hooks.
+    (@jasone)
+  - Remove ratio-based purging.  (@jasone)
+  - Remove --disable-tcache.  (@jasone)
+  - Remove --disable-tls.  (@jasone)
+  - Remove --enable-ivsalloc.  (@jasone)
+  - Remove --with-lg-size-class-group.  (@jasone)
+  - Remove --with-lg-tiny-min.  (@jasone)
+  - Remove --disable-cc-silence.  (@jasone)
+  - Remove --enable-code-coverage.  (@jasone)
+  - Remove --disable-munmap (replaced by opt.retain).  (@jasone)
+  - Remove Valgrind support.  (@jasone)
+  - Remove quarantine support.  (@jasone)
+  - Remove redzone support.  (@jasone)
+  - Remove mallctl interfaces (various authors):
+    + config.munmap
+    + config.tcache
+    + config.tls
+    + config.valgrind
+    + opt.lg_chunk
+    + opt.purge
+    + opt.lg_dirty_mult
+    + opt.decay_time
+    + opt.quarantine
+    + opt.redzone
+    + opt.thp
+    + arena.<i>.lg_dirty_mult
+    + arena.<i>.decay_time
+    + arena.<i>.chunk_hooks
+    + arenas.initialized
+    + arenas.lg_dirty_mult
+    + arenas.decay_time
+    + arenas.bin.<i>.run_size
+    + arenas.nlruns
+    + arenas.lrun.<i>.size
+    + arenas.nhchunks
+    + arenas.hchunk.<i>.size
+    + arenas.extend
+    + stats.cactive
+    + stats.arenas.<i>.lg_dirty_mult
+    + stats.arenas.<i>.decay_time
+    + stats.arenas.<i>.metadata.{mapped,allocated}
+    + stats.arenas.<i>.{npurge,nmadvise,purged}
+    + stats.arenas.<i>.huge.{allocated,nmalloc,ndalloc,nrequests}
+    + stats.arenas.<i>.bins.<j>.{nruns,reruns,curruns}
+    + stats.arenas.<i>.lruns.<j>.{nmalloc,ndalloc,nrequests,curruns}
+    + stats.arenas.<i>.hchunks.<j>.{nmalloc,ndalloc,nrequests,curhchunks}
+
+  Bug fixes:
+  - Improve interval-based profile dump triggering to dump only one profile when
+    a single allocation's size exceeds the interval.  (@jasone)
+  - Use prefixed function names (as controlled by --with-jemalloc-prefix) when
+    pruning backtrace frames in jeprof.  (@jasone)
+
+* 4.5.0 (February 28, 2017)
+
+  This is the first release to benefit from much broader continuous integration
+  testing, thanks to @davidtgoldblatt.  Had we had this testing infrastructure
+  in place for prior releases, it would have caught all of the most serious
+  regressions fixed by this release.
+
+  New features:
+  - Add --disable-thp and the opt.thp mallctl to provide opt-out mechanisms for
+    transparent huge page integration.  (@jasone)
+  - Update zone allocator integration to work with macOS 10.12.  (@glandium)
+  - Restructure *CFLAGS configuration, so that CFLAGS behaves typically, and
+    EXTRA_CFLAGS provides a way to specify e.g. -Werror during building, but not
+    during configuration.  (@jasone, @ronawho)
+
+  Bug fixes:
+  - Fix DSS (sbrk(2)-based) allocation.  This regression was first released in
+    4.3.0.  (@jasone)
+  - Handle race in per size class utilization computation.  This functionality
+    was first released in 4.0.0.  (@interwq)
+  - Fix lock order reversal during gdump.  (@jasone)
+  - Fix/refactor tcache synchronization.  This regression was first released in
+    4.0.0.  (@jasone)
+  - Fix various JSON-formatted malloc_stats_print() bugs.  This functionality
+    was first released in 4.3.0.  (@jasone)
+  - Fix huge-aligned allocation.  This regression was first released in 4.4.0.
+    (@jasone)
+  - When transparent huge page integration is enabled, detect what state pages
+    start in according to the kernel's current operating mode, and only convert
+    arena chunks to non-huge during purging if that is not their initial state.
+    This functionality was first released in 4.4.0.  (@jasone)
+  - Fix lg_chunk clamping for the --enable-cache-oblivious --disable-fill case.
+    This regression was first released in 4.0.0.  (@jasone, @428desmo)
+  - Properly detect sparc64 when building for Linux.  (@glaubitz)
+
+* 4.4.0 (December 3, 2016)
+
+  New features:
+  - Add configure support for *-*-linux-android.  (@cferris1000, @jasone)
+  - Add the --disable-syscall configure option, for use on systems that place
+    security-motivated limitations on syscall(2).  (@jasone)
+  - Add support for Debian GNU/kFreeBSD.  (@thesam)
+
+  Optimizations:
+  - Add extent serial numbers and use them where appropriate as a sort key that
+    is higher priority than address, so that the allocation policy prefers older
+    extents.  This tends to improve locality (decrease fragmentation) when
+    memory grows downward.  (@jasone)
+  - Refactor madvise(2) configuration so that MADV_FREE is detected and utilized
+    on Linux 4.5 and newer.  (@jasone)
+  - Mark partially purged arena chunks as non-huge-page.  This improves
+    interaction with Linux's transparent huge page functionality.  (@jasone)
+
+  Bug fixes:
+  - Fix size class computations for edge conditions involving extremely large
+    allocations.  This regression was first released in 4.0.0.  (@jasone,
+    @ingvarha)
+  - Remove overly restrictive assertions related to the cactive statistic.  This
+    regression was first released in 4.1.0.  (@jasone)
+  - Implement a more reliable detection scheme for os_unfair_lock on macOS.
+    (@jszakmeister)
+
 * 4.3.1 (November 7, 2016)

  Bug fixes:
@ -231,7 +1016,7 @@ brevity.  Much more detail can be found in the git revision history:
  these fixes, xallocx() now tries harder to partially fulfill requests for
  optional extra space.  Note that a couple of minor heap profiling
  optimizations are included, but these are better thought of as performance
-  fixes that were integral to disovering most of the other bugs.
+  fixes that were integral to discovering most of the other bugs.

  Optimizations:
  - Avoid a chunk metadata read in arena_prof_tctx_set(), since it is in the
--- a/414
+++ b/414
@ -1,414 +0,0 @@
-Building and installing a packaged release of jemalloc can be as simple as
-typing the following while in the root directory of the source tree:
-
-    ./configure
-    make
-    make install
-
-If building from unpackaged developer sources, the simplest command sequence
-that might work is:
-
-    ./autogen.sh
-    make dist
-    make
-    make install
-
-Note that documentation is not built by the default target because doing so
-would create a dependency on xsltproc in packaged releases, hence the
-requirement to either run 'make dist' or avoid installing docs via the various
-install_* targets documented below.
-
-=== Advanced configuration =====================================================
-
-The 'configure' script supports numerous options that allow control of which
-functionality is enabled, where jemalloc is installed, etc.  Optionally, pass
-any of the following arguments (not a definitive list) to 'configure':
-
--help
-    Print a definitive list of options.
-
--prefix=<install-root-dir>
-    Set the base directory in which to install.  For example:
-
-        ./configure --prefix=/usr/local
-
-    will cause files to be installed into /usr/local/include, /usr/local/lib,
-    and /usr/local/man.
-
--with-version=<major>.<minor>.<bugfix>-<nrev>-g<gid>
-    Use the specified version string rather than trying to generate one (if in
-    a git repository) or use existing the VERSION file (if present).
-
--with-rpath=<colon-separated-rpath>
-    Embed one or more library paths, so that libjemalloc can find the libraries
-    it is linked to.  This works only on ELF-based systems.
-
--with-mangling=<map>
-    Mangle public symbols specified in <map> which is a comma-separated list of
-    name:mangled pairs.
-
-    For example, to use ld's --wrap option as an alternative method for
-    overriding libc's malloc implementation, specify something like:
-
-      --with-mangling=malloc:__wrap_malloc,free:__wrap_free[...]
-
-    Note that mangling happens prior to application of the prefix specified by
-    --with-jemalloc-prefix, and mangled symbols are then ignored when applying
-    the prefix.
-
--with-jemalloc-prefix=<prefix>
-    Prefix all public APIs with <prefix>.  For example, if <prefix> is
-    "prefix_", API changes like the following occur:
-
-      malloc()         --> prefix_malloc()
-      malloc_conf      --> prefix_malloc_conf
-      /etc/malloc.conf --> /etc/prefix_malloc.conf
-      MALLOC_CONF      --> PREFIX_MALLOC_CONF
-
-    This makes it possible to use jemalloc at the same time as the system
-    allocator, or even to use multiple copies of jemalloc simultaneously.
-
-    By default, the prefix is "", except on OS X, where it is "je_".  On OS X,
-    jemalloc overlays the default malloc zone, but makes no attempt to actually
-    replace the "malloc", "calloc", etc. symbols.
-
--without-export
-    Don't export public APIs.  This can be useful when building jemalloc as a
-    static library, or to avoid exporting public APIs when using the zone
-    allocator on OSX.
-
--with-private-namespace=<prefix>
-    Prefix all library-private APIs with <prefix>je_.  For shared libraries,
-    symbol visibility mechanisms prevent these symbols from being exported, but
-    for static libraries, naming collisions are a real possibility.  By
-    default, <prefix> is empty, which results in a symbol prefix of je_ .
-
--with-install-suffix=<suffix>
-    Append <suffix> to the base name of all installed files, such that multiple
-    versions of jemalloc can coexist in the same installation directory.  For
-    example, libjemalloc.so.0 becomes libjemalloc<suffix>.so.0.
-
--with-malloc-conf=<malloc_conf>
-    Embed <malloc_conf> as a run-time options string that is processed prior to
-    the malloc_conf global variable, the /etc/malloc.conf symlink, and the
-    MALLOC_CONF environment variable.  For example, to change the default chunk
-    size to 256 KiB:
-
-      --with-malloc-conf=lg_chunk:18
-
--disable-cc-silence
-    Disable code that silences non-useful compiler warnings.  This is mainly
-    useful during development when auditing the set of warnings that are being
-    silenced.
-
--enable-debug
-    Enable assertions and validation code.  This incurs a substantial
-    performance hit, but is very useful during application development.
-    Implies --enable-ivsalloc.
-
--enable-code-coverage
-    Enable code coverage support, for use during jemalloc test development.
-    Additional testing targets are available if this option is enabled:
-
-      coverage
-      coverage_unit
-      coverage_integration
-      coverage_stress
-
-    These targets do not clear code coverage results from previous runs, and
-    there are interactions between the various coverage targets, so it is
-    usually advisable to run 'make clean' between repeated code coverage runs.
-
--disable-stats
-    Disable statistics gathering functionality.  See the "opt.stats_print"
-    option documentation for usage details.
-
--enable-ivsalloc
-    Enable validation code, which verifies that pointers reside within
-    jemalloc-owned chunks before dereferencing them.  This incurs a minor
-    performance hit.
-
--enable-prof
-    Enable heap profiling and leak detection functionality.  See the "opt.prof"
-    option documentation for usage details.  When enabled, there are several
-    approaches to backtracing, and the configure script chooses the first one
-    in the following list that appears to function correctly:
-
-    + libunwind      (requires --enable-prof-libunwind)
-    + libgcc         (unless --disable-prof-libgcc)
-    + gcc intrinsics (unless --disable-prof-gcc)
-
--enable-prof-libunwind
-    Use the libunwind library (http://www.nongnu.org/libunwind/) for stack
-    backtracing.
-
--disable-prof-libgcc
-    Disable the use of libgcc's backtracing functionality.
-
--disable-prof-gcc
-    Disable the use of gcc intrinsics for backtracing.
-
--with-static-libunwind=<libunwind.a>
-    Statically link against the specified libunwind.a rather than dynamically
-    linking with -lunwind.
-
--disable-tcache
-    Disable thread-specific caches for small objects.  Objects are cached and
-    released in bulk, thus reducing the total number of mutex operations.  See
-    the "opt.tcache" option for usage details.
-
--disable-munmap
-    Disable virtual memory deallocation via munmap(2); instead keep track of
-    the virtual memory for later use.  munmap() is disabled by default (i.e.
-    --disable-munmap is implied) on Linux, which has a quirk in its virtual
-    memory allocation algorithm that causes semi-permanent VM map holes under
-    normal jemalloc operation.
-
--disable-fill
-    Disable support for junk/zero filling of memory, quarantine, and redzones.
-    See the "opt.junk", "opt.zero", "opt.quarantine", and "opt.redzone" option
-    documentation for usage details.
-
--disable-valgrind
-    Disable support for Valgrind.
-
--disable-zone-allocator
-    Disable zone allocator for Darwin.  This means jemalloc won't be hooked as
-    the default allocator on OSX/iOS.
-
--enable-utrace
-    Enable utrace(2)-based allocation tracing.  This feature is not broadly
-    portable (FreeBSD has it, but Linux and OS X do not).
-
--enable-xmalloc
-    Enable support for optional immediate termination due to out-of-memory
-    errors, as is commonly implemented by "xmalloc" wrapper function for malloc.
-    See the "opt.xmalloc" option documentation for usage details.
-
--enable-lazy-lock
-    Enable code that wraps pthread_create() to detect when an application
-    switches from single-threaded to multi-threaded mode, so that it can avoid
-    mutex locking/unlocking operations while in single-threaded mode.  In
-    practice, this feature usually has little impact on performance unless
-    thread-specific caching is disabled.
-
--disable-tls
-    Disable thread-local storage (TLS), which allows for fast access to
-    thread-local variables via the __thread keyword.  If TLS is available,
-    jemalloc uses it for several purposes.
-
--disable-cache-oblivious
-    Disable cache-oblivious large allocation alignment for large allocation
-    requests with no alignment constraints.  If this feature is disabled, all
-    large allocations are page-aligned as an implementation artifact, which can
-    severely harm CPU cache utilization.  However, the cache-oblivious layout
-    comes at the cost of one extra page per large allocation, which in the
-    most extreme case increases physical memory usage for the 16 KiB size class
-    to 20 KiB.
-
--with-xslroot=<path>
-    Specify where to find DocBook XSL stylesheets when building the
-    documentation.
-
--with-lg-page=<lg-page>
-    Specify the base 2 log of the system page size.  This option is only useful
-    when cross compiling, since the configure script automatically determines
-    the host's page size by default.
-
--with-lg-page-sizes=<lg-page-sizes>
-    Specify the comma-separated base 2 logs of the page sizes to support.  This
-    option may be useful when cross-compiling in combination with
-    --with-lg-page, but its primary use case is for integration with FreeBSD's
-    libc, wherein jemalloc is embedded.
-
--with-lg-size-class-group=<lg-size-class-group>
-    Specify the base 2 log of how many size classes to use for each doubling in
-    size.  By default jemalloc uses <lg-size-class-group>=2, which results in
-    e.g. the following size classes:
-
-      [...], 64,
-      80, 96, 112, 128,
-      160, [...]
-
-    <lg-size-class-group>=3 results in e.g. the following size classes:
-
-      [...], 64,
-      72, 80, 88, 96, 104, 112, 120, 128,
-      144, [...]
-
-    The minimal <lg-size-class-group>=0 causes jemalloc to only provide size
-    classes that are powers of 2:
-
-      [...],
-      64,
-      128,
-      256,
-      [...]
-
-    An implementation detail currently limits the total number of small size
-    classes to 255, and a compilation error will result if the
-    <lg-size-class-group> you specify cannot be supported.  The limit is
-    roughly <lg-size-class-group>=4, depending on page size.
-
--with-lg-quantum=<lg-quantum>
-    Specify the base 2 log of the minimum allocation alignment.  jemalloc needs
-    to know the minimum alignment that meets the following C standard
-    requirement (quoted from the April 12, 2011 draft of the C11 standard):
-
-      The pointer returned if the allocation succeeds is suitably aligned so
-      that it may be assigned to a pointer to any type of object with a
-      fundamental alignment requirement and then used to access such an object
-      or an array of such objects in the space allocated [...]
-
-    This setting is architecture-specific, and although jemalloc includes known
-    safe values for the most commonly used modern architectures, there is a
-    wrinkle related to GNU libc (glibc) that may impact your choice of
-    <lg-quantum>.  On most modern architectures, this mandates 16-byte alignment
-    (<lg-quantum>=4), but the glibc developers chose not to meet this
-    requirement for performance reasons.  An old discussion can be found at
-    https://sourceware.org/bugzilla/show_bug.cgi?id=206 .  Unlike glibc,
-    jemalloc does follow the C standard by default (caveat: jemalloc
-    technically cheats if --with-lg-tiny-min is smaller than
-    --with-lg-quantum), but the fact that Linux systems already work around
-    this allocator noncompliance means that it is generally safe in practice to
-    let jemalloc's minimum alignment follow glibc's lead.  If you specify
-    --with-lg-quantum=3 during configuration, jemalloc will provide additional
-    size classes that are not 16-byte-aligned (24, 40, and 56, assuming
-    --with-lg-size-class-group=2).
-
--with-lg-tiny-min=<lg-tiny-min>
-    Specify the base 2 log of the minimum tiny size class to support.  Tiny
-    size classes are powers of 2 less than the quantum, and are only
-    incorporated if <lg-tiny-min> is less than <lg-quantum> (see
-    --with-lg-quantum).  Tiny size classes technically violate the C standard
-    requirement for minimum alignment, and crashes could conceivably result if
-    the compiler were to generate instructions that made alignment assumptions,
-    both because illegal instruction traps could result, and because accesses
-    could straddle page boundaries and cause segmentation faults due to
-    accessing unmapped addresses.
-
-    The default of <lg-tiny-min>=3 works well in practice even on architectures
-    that technically require 16-byte alignment, probably for the same reason
-    --with-lg-quantum=3 works.  Smaller tiny size classes can, and will, cause
-    crashes (see https://bugzilla.mozilla.org/show_bug.cgi?id=691003 for an
-    example).
-
-    This option is rarely useful, and is mainly provided as documentation of a
-    subtle implementation detail.  If you do use this option, specify a
-    value in [3, ..., <lg-quantum>].
-
-The following environment variables (not a definitive list) impact configure's
-behavior:
-
-CFLAGS="?"
-    Pass these flags to the compiler.  You probably shouldn't define this unless
-    you know what you are doing.  (Use EXTRA_CFLAGS instead.)
-
-EXTRA_CFLAGS="?"
-    Append these flags to CFLAGS.  This makes it possible to add flags such as
-    -Werror, while allowing the configure script to determine what other flags
-    are appropriate for the specified configuration.
-
-    The configure script specifically checks whether an optimization flag (-O*)
-    is specified in EXTRA_CFLAGS, and refrains from specifying an optimization
-    level if it finds that one has already been specified.
-
-CPPFLAGS="?"
-    Pass these flags to the C preprocessor.  Note that CFLAGS is not passed to
-    'cpp' when 'configure' is looking for include files, so you must use
-    CPPFLAGS instead if you need to help 'configure' find header files.
-
-LD_LIBRARY_PATH="?"
-    'ld' uses this colon-separated list to find libraries.
-
-LDFLAGS="?"
-    Pass these flags when linking.
-
-PATH="?"
-    'configure' uses this to find programs.
-
-=== Advanced compilation =======================================================
-
-To build only parts of jemalloc, use the following targets:
-
-    build_lib_shared
-    build_lib_static
-    build_lib
-    build_doc_html
-    build_doc_man
-    build_doc
-
-To install only parts of jemalloc, use the following targets:
-
-    install_bin
-    install_include
-    install_lib_shared
-    install_lib_static
-    install_lib
-    install_doc_html
-    install_doc_man
-    install_doc
-
-To clean up build results to varying degrees, use the following make targets:
-
-    clean
-    distclean
-    relclean
-
-=== Advanced installation ======================================================
-
-Optionally, define make variables when invoking make, including (not
-exclusively):
-
-INCLUDEDIR="?"
-    Use this as the installation prefix for header files.
-
-LIBDIR="?"
-    Use this as the installation prefix for libraries.
-
-MANDIR="?"
-    Use this as the installation prefix for man pages.
-
-DESTDIR="?"
-    Prepend DESTDIR to INCLUDEDIR, LIBDIR, DATADIR, and MANDIR.  This is useful
-    when installing to a different path than was specified via --prefix.
-
-CC="?"
-    Use this to invoke the C compiler.
-
-CFLAGS="?"
-    Pass these flags to the compiler.
-
-CPPFLAGS="?"
-    Pass these flags to the C preprocessor.
-
-LDFLAGS="?"
-    Pass these flags when linking.
-
-PATH="?"
-    Use this to search for programs used during configuration and building.
-
-=== Development ================================================================
-
-If you intend to make non-trivial changes to jemalloc, use the 'autogen.sh'
-script rather than 'configure'.  This re-generates 'configure', enables
-configuration dependency rules, and enables re-generation of automatically
-generated source files.
-
-The build system supports using an object directory separate from the source
-tree.  For example, you can create an 'obj' directory, and from within that
-directory, issue configuration and build commands:
-
-    autoconf
-    mkdir obj
-    cd obj
-    ../configure --enable-autogen
-    make
-
-=== Documentation ==============================================================
-
-The manual page is generated in both html and roff formats.  Any web browser
-can be used to view the html manual.  The roff manual page can be formatted
-prior to installation via the following command:
-
-    nroff -man -t doc/jemalloc.3
--- a/INSTALL.md
+++ b/INSTALL.md
@ -0,0 +1,527 @@
+Building and installing a packaged release of jemalloc can be as simple as
+typing the following while in the root directory of the source tree:
+
+    ./configure
+    make
+    make install
+
+If building from unpackaged developer sources, the simplest command sequence
+that might work is:
+
+    ./autogen.sh
+    make
+    make install
+
+You can uninstall the installed build artifacts like this:
+
+    make uninstall
+
+Notes:
+ - "autoconf" needs to be installed
+ - Documentation is built by the default target only when xsltproc is
+available.  Build will warn but not stop if the dependency is missing.
+
+
+## Advanced configuration
+
+The 'configure' script supports numerous options that allow control of which
+functionality is enabled, where jemalloc is installed, etc.  Optionally, pass
+any of the following arguments (not a definitive list) to 'configure':
+
+* `--help`
+
+    Print a definitive list of options.
+
+* `--prefix=<install-root-dir>`
+
+    Set the base directory in which to install.  For example:
+
+        ./configure --prefix=/usr/local
+
+    will cause files to be installed into /usr/local/include, /usr/local/lib,
+    and /usr/local/man.
+
+* `--with-version=(<major>.<minor>.<bugfix>-<nrev>-g<gid>|VERSION)`
+
+    The VERSION file is mandatory for successful configuration, and the
+    following steps are taken to assure its presence:
+    1) If `--with-version=<major>.<minor>.<bugfix>-<nrev>-g<gid>` is specified,
+       generate VERSION using the specified value.
+    2) If --with-version is not specified in either form and the source
+       directory is inside a git repository, try to generate VERSION via 'git
+       describe' invocations that pattern-match release tags.
+    3) If VERSION is missing, generate it with a bogus version:
+       0.0.0-0-g0000000000000000000000000000000000000000
+
+    Note that --with-version=VERSION bypasses (1) and (2), which simplifies
+    VERSION configuration when embedding a jemalloc release into another
+    project's git repository.
+
+* `--with-rpath=<colon-separated-rpath>`
+
+    Embed one or more library paths, so that libjemalloc can find the libraries
+    it is linked to.  This works only on ELF-based systems.
+
+* `--with-mangling=<map>`
+
+    Mangle public symbols specified in `<map>`, which is a comma-separated list of
+    name:mangled pairs.
+
+    For example, to use ld's --wrap option as an alternative method for
+    overriding libc's malloc implementation, specify something like:
+
+      --with-mangling=malloc:__wrap_malloc,free:__wrap_free[...]
+
+    Note that mangling happens prior to application of the prefix specified by
+    --with-jemalloc-prefix, and mangled symbols are then ignored when applying
+    the prefix.
+
+* `--with-jemalloc-prefix=<prefix>`
+
+    Prefix all public APIs with `<prefix>`.  For example, if `<prefix>` is
+    "prefix_", API changes like the following occur:
+
+      malloc()         --> prefix_malloc()
+      malloc_conf      --> prefix_malloc_conf
+      /etc/malloc.conf --> /etc/prefix_malloc.conf
+      MALLOC_CONF      --> PREFIX_MALLOC_CONF
+
+    This makes it possible to use jemalloc at the same time as the system
+    allocator, or even to use multiple copies of jemalloc simultaneously.
+
+    By default, the prefix is "", except on OS X, where it is "je_".  On OS X,
+    jemalloc overlays the default malloc zone, but makes no attempt to actually
+    replace the "malloc", "calloc", etc. symbols.
+
+* `--without-export`
+
+    Don't export public APIs.  This can be useful when building jemalloc as a
+    static library, or to avoid exporting public APIs when using the zone
+    allocator on OSX.
+
+* `--with-private-namespace=<prefix>`
+
+    Prefix all library-private APIs with `<prefix>je_`.  For shared libraries,
+    symbol visibility mechanisms prevent these symbols from being exported, but
+    for static libraries, naming collisions are a real possibility.  By
+    default, `<prefix>` is empty, which results in a symbol prefix of `je_`.
+
+* `--with-install-suffix=<suffix>`
+
+    Append `<suffix>` to the base name of all installed files, such that multiple
+    versions of jemalloc can coexist in the same installation directory.  For
+    example, `libjemalloc.so.0` becomes `libjemalloc<suffix>.so.0`.
+
+* `--with-malloc-conf=<malloc_conf>`
+
+    Embed `<malloc_conf>` as a run-time options string that is processed prior to
+    the malloc_conf global variable, the /etc/malloc.conf symlink, and the
+    MALLOC_CONF environment variable.  For example, to change the default decay
+    time to 30 seconds:
+
+      --with-malloc-conf=decay_ms:30000
+
+* `--enable-debug`
+
+    Enable assertions and validation code.  This incurs a substantial
+    performance hit, but is very useful during application development.
+
+* `--disable-stats`
+
+    Disable statistics gathering functionality.  See the "opt.stats_print"
+    option documentation for usage details.
+
+* `--enable-prof`
+
+    Enable heap profiling and leak detection functionality.  See the "opt.prof"
+    option documentation for usage details.  When enabled, there are several
+    approaches to backtracing, and the configure script chooses the first one
+    in the following list that appears to function correctly:
+
+    + libunwind      (requires --enable-prof-libunwind)
+    + frame pointer  (requires --enable-prof-frameptr)
+    + libgcc         (unless --disable-prof-libgcc)
+    + gcc intrinsics (unless --disable-prof-gcc)
+
+* `--enable-prof-libunwind`
+
+    Use the libunwind library (http://www.nongnu.org/libunwind/) for stack
+    backtracing.
+
+* `--enable-prof-frameptr`
+
+    Use the optimized frame pointer unwinder for stack backtracing. Safe
+    to use in mixed code (with and without frame pointers) - but requires
+    frame pointers to produce meaningful stacks. Linux only.
+
+* `--disable-prof-libgcc`
+
+    Disable the use of libgcc's backtracing functionality.
+
+* `--disable-prof-gcc`
+
+    Disable the use of gcc intrinsics for backtracing.
+
+* `--with-static-libunwind=<libunwind.a>`
+
+    Statically link against the specified libunwind.a rather than dynamically
+    linking with -lunwind.
+
+* `--disable-fill`
+
+    Disable support for junk/zero filling of memory.  See the "opt.junk" and
+    "opt.zero" option documentation for usage details.
+
+* `--disable-zone-allocator`
+
+    Disable zone allocator for Darwin.  This means jemalloc won't be hooked as
+    the default allocator on OSX/iOS.
+
+* `--enable-utrace`
+
+    Enable utrace(2)-based allocation tracing.  This feature is not broadly
+    portable (FreeBSD has it, but Linux and OS X do not).
+
+* `--enable-xmalloc`
+
+    Enable support for optional immediate termination due to out-of-memory
+    errors, as is commonly implemented by "xmalloc" wrapper functions for malloc.
+    See the "opt.xmalloc" option documentation for usage details.
+
+* `--enable-lazy-lock`
+
+    Enable code that wraps pthread_create() to detect when an application
+    switches from single-threaded to multi-threaded mode, so that it can avoid
+    mutex locking/unlocking operations while in single-threaded mode.  In
+    practice, this feature usually has little impact on performance unless
+    thread-specific caching is disabled.
+
+* `--disable-cache-oblivious`
+
+    Disable cache-oblivious large allocation alignment by default, for large
+    allocation requests with no alignment constraints.  If this feature is
+    disabled, all large allocations are page-aligned as an implementation
+    artifact, which can severely harm CPU cache utilization.  However, the
+    cache-oblivious layout comes at the cost of one extra page per large
+    allocation, which in the most extreme case increases physical memory usage
+    for the 16 KiB size class to 20 KiB.
+
+* `--disable-syscall`
+
+    Disable use of syscall(2) in favor of {open,read,write,close}(2).  This is
+    intended as a workaround for systems that place security limitations on
+    syscall(2).
+
+* `--disable-cxx`
+
+    Disable C++ integration.  This will cause new and delete operator
+    implementations to be omitted.
+
+* `--with-xslroot=<path>`
+
+    Specify where to find DocBook XSL stylesheets when building the
+    documentation.
+
+* `--with-lg-page=<lg-page>`
+
+    Specify the base 2 log of the allocator page size, which must in turn be at
+    least as large as the system page size.  By default the configure script
+    determines the host's page size and sets the allocator page size equal to
+    the system page size, so this option need not be specified unless the
+    system page size may change between configuration and execution, e.g. when
+    cross compiling.
+
+* `--with-lg-hugepage=<lg-hugepage>`
+
+    Specify the base 2 log of the system huge page size.  This option is useful
+    when cross compiling, or when overriding the default for systems that do
+    not explicitly support huge pages.
+
+* `--with-lg-quantum=<lg-quantum>`
+
+    Specify the base 2 log of the minimum allocation alignment.  jemalloc needs
+    to know the minimum alignment that meets the following C standard
+    requirement (quoted from the April 12, 2011 draft of the C11 standard):
+
+    >  The pointer returned if the allocation succeeds is suitably aligned so
+      that it may be assigned to a pointer to any type of object with a
+      fundamental alignment requirement and then used to access such an object
+      or an array of such objects in the space allocated [...]
+
+    This setting is architecture-specific, and although jemalloc includes known
+    safe values for the most commonly used modern architectures, there is a
+    wrinkle related to GNU libc (glibc) that may impact your choice of
+    `<lg-quantum>`.  On most modern architectures, this mandates 16-byte
+    alignment (`<lg-quantum>`=4), but the glibc developers chose not to meet this
+    requirement for performance reasons.  An old discussion can be found at
+    <https://sourceware.org/bugzilla/show_bug.cgi?id=206> .  Unlike glibc,
+    jemalloc does follow the C standard by default (caveat: jemalloc
+    technically cheats for size classes smaller than the quantum), but the fact
+    that Linux systems already work around this allocator noncompliance means
+    that it is generally safe in practice to let jemalloc's minimum alignment
+    follow glibc's lead.  If you specify `--with-lg-quantum=3` during
+    configuration, jemalloc will provide additional size classes that are not
+    16-byte-aligned (24, 40, and 56).
+
+* `--with-lg-vaddr=<lg-vaddr>`
+
+    Specify the number of significant virtual address bits.  By default, the
+    configure script attempts to detect virtual address size on those platforms
+    where it knows how, and picks a default otherwise.  This option may be
+    useful when cross-compiling.
+
+* `--disable-initial-exec-tls`
+
+    Disable the initial-exec TLS model for jemalloc's internal thread-local
+    storage (on those platforms that support explicit settings).  This can allow
+    jemalloc to be dynamically loaded after program startup (e.g. using dlopen).
+    Note that in this case, there will be two malloc implementations operating
+    in the same process, which will almost certainly result in confusing runtime
+    crashes if pointers leak from one implementation to the other.
+
+* `--disable-libdl`
+
+    Disable the usage of libdl, namely dlsym(3) which is required by the lazy
+    lock option.  This can allow building static binaries.
+
+The following environment variables (not a definitive list) impact configure's
+behavior:
+
+* `CFLAGS="?"`
+* `CXXFLAGS="?"`
+
+    Pass these flags to the C/C++ compiler.  Any flags set by the configure
+    script are prepended, which means explicitly set flags generally take
+    precedence.  Take care when specifying flags such as -Werror, because
+    configure tests may be affected in undesirable ways.
+
+* `EXTRA_CFLAGS="?"`
+* `EXTRA_CXXFLAGS="?"`
+
+    Append these flags to CFLAGS/CXXFLAGS, without passing them to the
+    compiler(s) during configuration.  This makes it possible to add flags such
+    as -Werror, while allowing the configure script to determine what other
+    flags are appropriate for the specified configuration.
+
+* `CPPFLAGS="?"`
+
+    Pass these flags to the C preprocessor.  Note that CFLAGS is not passed to
+    'cpp' when 'configure' is looking for include files, so you must use
+    CPPFLAGS instead if you need to help 'configure' find header files.
+
+* `LD_LIBRARY_PATH="?"`
+
+    'ld' uses this colon-separated list to find libraries.
+
+* `LDFLAGS="?"`
+
+    Pass these flags when linking.
+
+* `PATH="?"`
+
+    'configure' uses this to find programs.
+
+In some cases it may be necessary to work around configuration results that do
+not match reality.  For example, Linux 3.4 added support for the MADV_DONTDUMP
+flag to madvise(2), which can cause problems if building on a host with
+MADV_DONTDUMP support and deploying to a target without.  To work around this,
+use a cache file to override the relevant configuration variable defined in
+configure.ac, e.g.:
+
+    echo "je_cv_madv_dontdump=no" > config.cache && ./configure -C
+
+
+## Advanced compilation
+
+To build only parts of jemalloc, use the following targets:
+
+    build_lib_shared
+    build_lib_static
+    build_lib
+    build_doc_html
+    build_doc_man
+    build_doc
+
+To install only parts of jemalloc, use the following targets:
+
+    install_bin
+    install_include
+    install_lib_shared
+    install_lib_static
+    install_lib_pc
+    install_lib
+    install_doc_html
+    install_doc_man
+    install_doc
+
+To clean up build results to varying degrees, use the following make targets:
+
+    clean
+    distclean
+    relclean
+
+
+## Advanced installation
+
+Optionally, define make variables when invoking make, including (not
+exclusively):
+
+* `INCLUDEDIR="?"`
+
+    Use this as the installation prefix for header files.
+
+* `LIBDIR="?"`
+
+    Use this as the installation prefix for libraries.
+
+* `MANDIR="?"`
+
+    Use this as the installation prefix for man pages.
+
+* `DESTDIR="?"`
+
+    Prepend DESTDIR to INCLUDEDIR, LIBDIR, DATADIR, and MANDIR.  This is useful
+    when installing to a different path than was specified via --prefix.
+
+* `CC="?"`
+
+    Use this to invoke the C compiler.
+
+* `CFLAGS="?"`
+
+    Pass these flags to the compiler.
+
+* `CPPFLAGS="?"`
+
+    Pass these flags to the C preprocessor.
+
+* `LDFLAGS="?"`
+
+    Pass these flags when linking.
+
+* `PATH="?"`
+
+    Use this to search for programs used during configuration and building.
+
+## Building for Windows
+
+There are at least two ways to build jemalloc's libraries for Windows. They
+differ in their ease of use and flexibility.
+
+### With MSVC solutions
+This is the easy, but less flexible approach. It doesn't let you specify
+arguments to the `configure` script.
+  
+1. Install Cygwin with at least the following packages:
+   * autoconf
+   * autogen
+   * gawk
+   * grep
+   * sed
+
+2. Install Visual Studio 2015 or 2017 with Visual C++
+
+3. Add Cygwin\bin to the PATH environment variable
+
+4. Open "x64 Native Tools Command Prompt for VS 2017"
+   (note: x86/x64 doesn't matter at this point)
+
+5. Generate header files:
+   sh -c "CC=cl ./autogen.sh"
+
+6. Now the project can be opened and built in Visual Studio:
+   msvc\jemalloc_vc2017.sln
+
+### With MSYS
+This is a more involved approach that offers the same configuration flexibility
+as Linux builds. We use it for our CI workflow to test different jemalloc
+configurations on Windows.
+
+1. Install the prerequisites
+    1. MSYS2
+    2. Chocolatey
+    3. Visual Studio if you want to compile with the MSVC compiler
+
+2. Run your bash emulation. It could be MSYS2 or Git Bash (this manual was
+   tested on both)
+3. Manually and selectively follow
+   [before_install.sh](https://github.com/jemalloc/jemalloc/blob/dev/scripts/windows/before_install.sh)
+   script.
+    1. Skip the `TRAVIS_OS_NAME` check, `rm -rf C:/tools/msys64` and `choco
+       uninstall/upgrade` part.
+    2.  If using `msys2` shell, add path to `RefreshEnv.cmd` to `PATH`:
+        `PATH="$PATH:/c/ProgramData/chocolatey/bin"`
+    3. Assign `msys_shell_cmd`, `msys2`, `mingw32` and `mingw64` as in the
+       script.
+    4. Pick `CROSS_COMPILE_32BIT` , `CC` and `USE_MSVC` values depending on
+       your needs. For instance, if you'd like to build for x86_64 Windows
+       with `gcc`, then `CROSS_COMPILE_32BIT="no"`, `CC="gcc"` and
+       `USE_MSVC=""`. If you'd like to build for x86 Windows with `cl.exe`,
+       then `CROSS_COMPILE_32BIT="yes"`, `CC="cl.exe"`, `USE_MSVC="x86"`.
+       For x86_64 builds with `cl.exe`, assign `USE_MSVC="amd64"` and
+       `CROSS_COMPILE_32BIT="no"`.
+    5. Replace the path to `vcvarsall.bat` with the path on your system. For
+       instance, on my Windows PC with Visual Studio 2017, the path is
+       `C:\Program Files (x86)\Microsoft Visual
+       Studio\2017\BuildTools\VC\Auxiliary\Build\vcvarsall.bat`.
+    6. Execute the rest of the script. It will install the required
+       dependencies and assign the variable `build_env`, which is a function
+       that executes the following commands with the correct environment
+       variables set.
+4. Use `$build_env <command>` as you would in a Linux shell:
+     1. `$build_env autoconf`
+     2. `$build_env ./configure CC="<desired compiler>" <configuration flags>`
+     3. `$build_env mingw32-make`
+
+If you're having any issues with the above, ensure the following:
+
+5. When you run `cmd //C RefreshEnv.cmd`, you get an output line starting with
+   `Refreshing` . If it errors saying `RefreshEnv.cmd` is not found, then you
+   need to add it to your `PATH` as described above in item 3.2
+
+6. When you run `cmd //C $vcvarsall`, it prints a bunch of environment
+   variables. Otherwise, check the path to the `vcvarsall.bat` in `$vcvarsall`
+   script and fix it.
+
+### Building from vcpkg
+
+The jemalloc port in vcpkg is kept up to date by Microsoft team members and
+community contributors (see <https://github.com/Microsoft/vcpkg>).  You can
+download and install jemalloc using the vcpkg dependency manager:
+
+```shell
+git clone https://github.com/Microsoft/vcpkg.git
+cd vcpkg
+./bootstrap-vcpkg.sh  # ./bootstrap-vcpkg.bat for Windows
+./vcpkg integrate install
+./vcpkg install jemalloc
+```
+
+If the version is out of date, please [create an issue or pull
+request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.
+
+## Development
+
+If you intend to make non-trivial changes to jemalloc, use the 'autogen.sh'
+script rather than 'configure'.  This re-generates 'configure', enables
+configuration dependency rules, and enables re-generation of automatically
+generated source files.
+
+The build system supports using an object directory separate from the source
+tree.  For example, you can create an 'obj' directory, and from within that
+directory, issue configuration and build commands:
+
+    autoconf
+    mkdir obj
+    cd obj
+    ../configure --enable-autogen
+    make
+
+
+## Documentation
+
+The manual page is generated in both html and roff formats.  Any web browser
+can be used to view the html manual.  The roff manual page can be formatted
+prior to installation via the following command:
+
+    nroff -man -t doc/jemalloc.3
--- a/Makefile.in
+++ b/Makefile.in
@ -9,6 +9,7 @@ vpath % .
 SHELL := /bin/sh

 CC := @CC@
+CXX := @CXX@

 # Configuration parameters.
 DESTDIR =
@ -23,9 +24,15 @@ abs_srcroot := @abs_srcroot@
 abs_objroot := @abs_objroot@

 # Build parameters.
-CPPFLAGS := @CPPFLAGS@ -I$(srcroot)include -I$(objroot)include
+CPPFLAGS := @CPPFLAGS@ -I$(objroot)include -I$(srcroot)include
+CONFIGURE_CFLAGS := @CONFIGURE_CFLAGS@
+SPECIFIED_CFLAGS := @SPECIFIED_CFLAGS@
 EXTRA_CFLAGS := @EXTRA_CFLAGS@
-CFLAGS := @CFLAGS@ $(EXTRA_CFLAGS)
+CFLAGS := $(strip $(CONFIGURE_CFLAGS) $(SPECIFIED_CFLAGS) $(EXTRA_CFLAGS))
+CONFIGURE_CXXFLAGS := @CONFIGURE_CXXFLAGS@
+SPECIFIED_CXXFLAGS := @SPECIFIED_CXXFLAGS@
+EXTRA_CXXFLAGS := @EXTRA_CXXFLAGS@
+CXXFLAGS := $(strip $(CONFIGURE_CXXFLAGS) $(SPECIFIED_CXXFLAGS) $(EXTRA_CXXFLAGS))
 LDFLAGS := @LDFLAGS@
 EXTRA_LDFLAGS := @EXTRA_LDFLAGS@
 LIBS := @LIBS@
@ -40,6 +47,7 @@ REV := @rev@
 install_suffix := @install_suffix@
 ABI := @abi@
 XSLTPROC := @XSLTPROC@
+XSLROOT := @XSLROOT@
 AUTOCONF := @AUTOCONF@
 _RPATH = @RPATH@
 RPATH = $(if $(1),$(call _RPATH,$(1)))
@ -48,10 +56,12 @@ cfghdrs_out := @cfghdrs_out@
 cfgoutputs_in := $(addprefix $(srcroot),@cfgoutputs_in@)
 cfgoutputs_out := @cfgoutputs_out@
 enable_autogen := @enable_autogen@
-enable_code_coverage := @enable_code_coverage@
+enable_doc := @enable_doc@
+enable_shared := @enable_shared@
+enable_static := @enable_static@
 enable_prof := @enable_prof@
-enable_valgrind := @enable_valgrind@
 enable_zone_allocator := @enable_zone_allocator@
+enable_experimental_smallocx := @enable_experimental_smallocx@
 MALLOC_CONF := @JEMALLOC_CPREFIX@MALLOC_CONF
 link_whole_archive := @link_whole_archive@
 DSO_LDFLAGS = @DSO_LDFLAGS@
@ -63,6 +73,8 @@ TEST_LD_MODE = @TEST_LD_MODE@
 MKLIB = @MKLIB@
 AR = @AR@
 ARFLAGS = @ARFLAGS@
+DUMP_SYMS = @DUMP_SYMS@
+AWK := @AWK@
 CC_MM = @CC_MM@
 LM := @LM@
 INSTALL = @INSTALL@
@ -84,35 +96,71 @@ BINS := $(objroot)bin/jemalloc-config $(objroot)bin/jemalloc.sh $(objroot)bin/je
 C_HDRS := $(objroot)include/jemalloc/jemalloc$(install_suffix).h
 C_SRCS := $(srcroot)src/jemalloc.c \
 	$(srcroot)src/arena.c \
-	$(srcroot)src/atomic.c \
+	$(srcroot)src/background_thread.c \
 	$(srcroot)src/base.c \
+	$(srcroot)src/bin.c \
+	$(srcroot)src/bin_info.c \
 	$(srcroot)src/bitmap.c \
-	$(srcroot)src/chunk.c \
-	$(srcroot)src/chunk_dss.c \
-	$(srcroot)src/chunk_mmap.c \
+	$(srcroot)src/buf_writer.c \
+	$(srcroot)src/cache_bin.c \
 	$(srcroot)src/ckh.c \
+	$(srcroot)src/counter.c \
 	$(srcroot)src/ctl.c \
+	$(srcroot)src/decay.c \
+	$(srcroot)src/div.c \
+	$(srcroot)src/ecache.c \
+	$(srcroot)src/edata.c \
+	$(srcroot)src/edata_cache.c \
+	$(srcroot)src/ehooks.c \
+	$(srcroot)src/emap.c \
+	$(srcroot)src/eset.c \
+	$(srcroot)src/exp_grow.c \
 	$(srcroot)src/extent.c \
-	$(srcroot)src/hash.c \
-	$(srcroot)src/huge.c \
-	$(srcroot)src/mb.c \
+	$(srcroot)src/extent_dss.c \
+	$(srcroot)src/extent_mmap.c \
+	$(srcroot)src/fxp.c \
+	$(srcroot)src/san.c \
+	$(srcroot)src/san_bump.c \
+	$(srcroot)src/hook.c \
+	$(srcroot)src/hpa.c \
+	$(srcroot)src/hpa_central.c \
+	$(srcroot)src/hpa_hooks.c \
+	$(srcroot)src/hpa_utils.c \
+	$(srcroot)src/hpdata.c \
+	$(srcroot)src/inspect.c \
+	$(srcroot)src/large.c \
+	$(srcroot)src/log.c \
+	$(srcroot)src/malloc_io.c \
+	$(srcroot)src/conf.c \
 	$(srcroot)src/mutex.c \
 	$(srcroot)src/nstime.c \
+	$(srcroot)src/pa.c \
+	$(srcroot)src/pa_extra.c \
+	$(srcroot)src/pac.c \
 	$(srcroot)src/pages.c \
-	$(srcroot)src/prng.c \
+	$(srcroot)src/peak_event.c \
 	$(srcroot)src/prof.c \
-	$(srcroot)src/quarantine.c \
+	$(srcroot)src/prof_data.c \
+	$(srcroot)src/prof_log.c \
+	$(srcroot)src/prof_recent.c \
+	$(srcroot)src/prof_stack_range.c \
+	$(srcroot)src/prof_stats.c \
+	$(srcroot)src/prof_sys.c \
+	$(srcroot)src/psset.c \
 	$(srcroot)src/rtree.c \
+	$(srcroot)src/safety_check.c \
+	$(srcroot)src/sc.c \
+	$(srcroot)src/sec.c \
 	$(srcroot)src/stats.c \
-	$(srcroot)src/spin.c \
+	$(srcroot)src/sz.c \
 	$(srcroot)src/tcache.c \
+	$(srcroot)src/test_hooks.c \
+	$(srcroot)src/thread_event.c \
+	$(srcroot)src/thread_event_registry.c \
 	$(srcroot)src/ticker.c \
 	$(srcroot)src/tsd.c \
 	$(srcroot)src/util.c \
 	$(srcroot)src/witness.c
-ifeq ($(enable_valgrind), 1)
-C_SRCS += $(srcroot)src/valgrind.c
-endif
 ifeq ($(enable_zone_allocator), 1)
 C_SRCS += $(srcroot)src/zone.c
 endif
@ -134,108 +182,255 @@ else
 LJEMALLOC := $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB)
 endif
 PC := $(objroot)jemalloc.pc
-MAN3 := $(objroot)doc/jemalloc$(install_suffix).3
 DOCS_XML := $(objroot)doc/jemalloc$(install_suffix).xml
 DOCS_HTML := $(DOCS_XML:$(objroot)%.xml=$(objroot)%.html)
 DOCS_MAN3 := $(DOCS_XML:$(objroot)%.xml=$(objroot)%.3)
 DOCS := $(DOCS_HTML) $(DOCS_MAN3)
 C_TESTLIB_SRCS := $(srcroot)test/src/btalloc.c $(srcroot)test/src/btalloc_0.c \
 	$(srcroot)test/src/btalloc_1.c $(srcroot)test/src/math.c \
-	$(srcroot)test/src/mtx.c $(srcroot)test/src/mq.c \
+	$(srcroot)test/src/mtx.c $(srcroot)test/src/sleep.c \
 	$(srcroot)test/src/SFMT.c $(srcroot)test/src/test.c \
 	$(srcroot)test/src/thd.c $(srcroot)test/src/timer.c
 ifeq (1, $(link_whole_archive))
 C_UTIL_INTEGRATION_SRCS :=
+C_UTIL_CPP_SRCS :=
 else
-C_UTIL_INTEGRATION_SRCS := $(srcroot)src/nstime.c $(srcroot)src/util.c
+C_UTIL_INTEGRATION_SRCS := $(srcroot)src/nstime.c $(srcroot)src/malloc_io.c \
+	$(srcroot)src/ticker.c
+C_UTIL_CPP_SRCS := $(srcroot)src/nstime.c $(srcroot)src/malloc_io.c
 endif
 TESTS_UNIT := \
 	$(srcroot)test/unit/a0.c \
+	$(srcroot)test/unit/arena_decay.c \
 	$(srcroot)test/unit/arena_reset.c \
 	$(srcroot)test/unit/atomic.c \
+	$(srcroot)test/unit/background_thread.c \
+	$(srcroot)test/unit/background_thread_enable.c \
+	$(srcroot)test/unit/background_thread_init.c \
+	$(srcroot)test/unit/base.c \
+	$(srcroot)test/unit/batch_alloc.c \
+	$(srcroot)test/unit/bin.c \
+	$(srcroot)test/unit/binshard.c \
 	$(srcroot)test/unit/bitmap.c \
+	$(srcroot)test/unit/bit_util.c \
+	$(srcroot)test/unit/buf_writer.c \
+	$(srcroot)test/unit/cache_bin.c \
 	$(srcroot)test/unit/ckh.c \
+	$(srcroot)test/unit/conf.c \
+	$(srcroot)test/unit/conf_init_0.c \
+	$(srcroot)test/unit/conf_init_1.c \
+	$(srcroot)test/unit/conf_init_confirm.c \
+	$(srcroot)test/unit/conf_parse.c \
+	$(srcroot)test/unit/counter.c \
 	$(srcroot)test/unit/decay.c \
+	$(srcroot)test/unit/div.c \
+	$(srcroot)test/unit/double_free.c \
+	$(srcroot)test/unit/edata_cache.c \
+	$(srcroot)test/unit/emitter.c \
+	$(srcroot)test/unit/extent_quantize.c \
+	${srcroot}test/unit/fb.c \
 	$(srcroot)test/unit/fork.c \
+	${srcroot}test/unit/fxp.c \
+	${srcroot}test/unit/san.c \
+	${srcroot}test/unit/san_bump.c \
 	$(srcroot)test/unit/hash.c \
+	$(srcroot)test/unit/hook.c \
+	$(srcroot)test/unit/hpa.c \
+	$(srcroot)test/unit/hpa_sec_integration.c \
+	$(srcroot)test/unit/hpa_thp_always.c \
+	$(srcroot)test/unit/hpa_vectorized_madvise.c \
+	$(srcroot)test/unit/hpa_vectorized_madvise_large_batch.c \
+	$(srcroot)test/unit/hpa_background_thread.c \
+	$(srcroot)test/unit/hpdata.c \
+	$(srcroot)test/unit/huge.c \
+	$(srcroot)test/unit/inspect.c \
 	$(srcroot)test/unit/junk.c \
 	$(srcroot)test/unit/junk_alloc.c \
 	$(srcroot)test/unit/junk_free.c \
-	$(srcroot)test/unit/lg_chunk.c \
+	$(srcroot)test/unit/json_stats.c \
+	$(srcroot)test/unit/large_ralloc.c \
+	$(srcroot)test/unit/log.c \
 	$(srcroot)test/unit/mallctl.c \
+	$(srcroot)test/unit/malloc_conf_2.c \
+	$(srcroot)test/unit/malloc_io.c \
 	$(srcroot)test/unit/math.c \
+	$(srcroot)test/unit/mpsc_queue.c \
 	$(srcroot)test/unit/mq.c \
 	$(srcroot)test/unit/mtx.c \
+	$(srcroot)test/unit/nstime.c \
+	$(srcroot)test/unit/ncached_max.c \
+	$(srcroot)test/unit/oversize_threshold.c \
+	$(srcroot)test/unit/pa.c \
+	$(srcroot)test/unit/pack.c \
+	$(srcroot)test/unit/pages.c \
+	$(srcroot)test/unit/peak.c \
 	$(srcroot)test/unit/ph.c \
 	$(srcroot)test/unit/prng.c \
 	$(srcroot)test/unit/prof_accum.c \
 	$(srcroot)test/unit/prof_active.c \
 	$(srcroot)test/unit/prof_gdump.c \
+	$(srcroot)test/unit/prof_hook.c \
 	$(srcroot)test/unit/prof_idump.c \
+	$(srcroot)test/unit/prof_log.c \
+	$(srcroot)test/unit/prof_mdump.c \
+	$(srcroot)test/unit/prof_recent.c \
 	$(srcroot)test/unit/prof_reset.c \
+	$(srcroot)test/unit/prof_small.c \
+	$(srcroot)test/unit/prof_stats.c \
+	$(srcroot)test/unit/prof_tctx.c \
 	$(srcroot)test/unit/prof_thread_name.c \
+	$(srcroot)test/unit/prof_sys_thread_name.c \
+	$(srcroot)test/unit/psset.c \
 	$(srcroot)test/unit/ql.c \
 	$(srcroot)test/unit/qr.c \
-	$(srcroot)test/unit/quarantine.c \
 	$(srcroot)test/unit/rb.c \
+	$(srcroot)test/unit/retained.c \
 	$(srcroot)test/unit/rtree.c \
-	$(srcroot)test/unit/run_quantize.c \
+	$(srcroot)test/unit/safety_check.c \
+	$(srcroot)test/unit/sc.c \
+	$(srcroot)test/unit/sec.c \
+	$(srcroot)test/unit/seq.c \
 	$(srcroot)test/unit/SFMT.c \
+	$(srcroot)test/unit/size_check.c \
 	$(srcroot)test/unit/size_classes.c \
+	$(srcroot)test/unit/slab.c \
 	$(srcroot)test/unit/smoothstep.c \
+	$(srcroot)test/unit/spin.c \
 	$(srcroot)test/unit/stats.c \
+	$(srcroot)test/unit/stats_print.c \
+	$(srcroot)test/unit/sz.c \
+	$(srcroot)test/unit/tcache_init.c \
+	$(srcroot)test/unit/tcache_max.c \
+	$(srcroot)test/unit/test_hooks.c \
+	$(srcroot)test/unit/thread_event.c \
 	$(srcroot)test/unit/ticker.c \
-	$(srcroot)test/unit/nstime.c \
 	$(srcroot)test/unit/tsd.c \
-	$(srcroot)test/unit/util.c \
+	$(srcroot)test/unit/uaf.c \
 	$(srcroot)test/unit/witness.c \
-	$(srcroot)test/unit/zero.c
+	$(srcroot)test/unit/zero.c \
+	$(srcroot)test/unit/zero_realloc_abort.c \
+	$(srcroot)test/unit/zero_realloc_free.c \
+	$(srcroot)test/unit/zero_realloc_alloc.c \
+	$(srcroot)test/unit/zero_reallocs.c
+ifeq (@enable_prof@, 1)
+TESTS_UNIT += \
+	$(srcroot)test/unit/arena_reset_prof.c \
+	$(srcroot)test/unit/batch_alloc_prof.c
+endif
 TESTS_INTEGRATION := $(srcroot)test/integration/aligned_alloc.c \
 	$(srcroot)test/integration/allocated.c \
-	$(srcroot)test/integration/sdallocx.c \
+	$(srcroot)test/integration/extent.c \
+	$(srcroot)test/integration/malloc.c \
 	$(srcroot)test/integration/mallocx.c \
 	$(srcroot)test/integration/MALLOCX_ARENA.c \
 	$(srcroot)test/integration/overflow.c \
 	$(srcroot)test/integration/posix_memalign.c \
 	$(srcroot)test/integration/rallocx.c \
+	$(srcroot)test/integration/sdallocx.c \
+	$(srcroot)test/integration/slab_sizes.c \
 	$(srcroot)test/integration/thread_arena.c \
 	$(srcroot)test/integration/thread_tcache_enabled.c \
-	$(srcroot)test/integration/xallocx.c \
-	$(srcroot)test/integration/chunk.c
-TESTS_STRESS := $(srcroot)test/stress/microbench.c
-TESTS := $(TESTS_UNIT) $(TESTS_INTEGRATION) $(TESTS_STRESS)
+	$(srcroot)test/integration/xallocx.c
+ifeq (@enable_experimental_smallocx@, 1)
+TESTS_INTEGRATION += \
+  $(srcroot)test/integration/smallocx.c
+endif
+ifeq (@enable_cxx@, 1)
+CPP_SRCS := $(srcroot)src/jemalloc_cpp.cpp
+TESTS_INTEGRATION_CPP := $(srcroot)test/integration/cpp/basic.cpp \
+	$(srcroot)test/integration/cpp/infallible_new_true.cpp \
+	$(srcroot)test/integration/cpp/infallible_new_false.cpp
+else
+CPP_SRCS :=
+TESTS_INTEGRATION_CPP :=
+endif
+TESTS_ANALYZE := $(srcroot)test/analyze/prof_bias.c \
+	$(srcroot)test/analyze/rand.c \
+	$(srcroot)test/analyze/sizes.c
+TESTS_STRESS := $(srcroot)test/stress/batch_alloc.c \
+	$(srcroot)test/stress/fill_flush.c \
+	$(srcroot)test/stress/hookbench.c \
+	$(srcroot)test/stress/large_microbench.c \
+	$(srcroot)test/stress/mallctl.c \
+	$(srcroot)test/stress/microbench.c
+ifeq (@enable_cxx@, 1)
+TESTS_STRESS_CPP := $(srcroot)test/stress/cpp/microbench.cpp
+else
+TESTS_STRESS_CPP :=
+endif

+
+TESTS := $(TESTS_UNIT) $(TESTS_INTEGRATION) $(TESTS_INTEGRATION_CPP) \
+	$(TESTS_ANALYZE) $(TESTS_STRESS) $(TESTS_STRESS_CPP)
+
+PRIVATE_NAMESPACE_HDRS := $(objroot)include/jemalloc/internal/private_namespace.h $(objroot)include/jemalloc/internal/private_namespace_jet.h
+PRIVATE_NAMESPACE_GEN_HDRS := $(PRIVATE_NAMESPACE_HDRS:%.h=%.gen.h)
+C_SYM_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.sym.$(O))
+C_SYMS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.sym)
 C_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.$(O))
+CPP_OBJS := $(CPP_SRCS:$(srcroot)%.cpp=$(objroot)%.$(O))
 C_PIC_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.pic.$(O))
+CPP_PIC_OBJS := $(CPP_SRCS:$(srcroot)%.cpp=$(objroot)%.pic.$(O))
+C_JET_SYM_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.jet.sym.$(O))
+C_JET_SYMS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.jet.sym)
 C_JET_OBJS := $(C_SRCS:$(srcroot)%.c=$(objroot)%.jet.$(O))
 C_TESTLIB_UNIT_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.unit.$(O))
 C_TESTLIB_INTEGRATION_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.integration.$(O))
 C_UTIL_INTEGRATION_OBJS := $(C_UTIL_INTEGRATION_SRCS:$(srcroot)%.c=$(objroot)%.integration.$(O))
+C_TESTLIB_ANALYZE_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.analyze.$(O))
 C_TESTLIB_STRESS_OBJS := $(C_TESTLIB_SRCS:$(srcroot)%.c=$(objroot)%.stress.$(O))
-C_TESTLIB_OBJS := $(C_TESTLIB_UNIT_OBJS) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(C_TESTLIB_STRESS_OBJS)
+C_TESTLIB_OBJS := $(C_TESTLIB_UNIT_OBJS) $(C_TESTLIB_INTEGRATION_OBJS) \
+	$(C_UTIL_INTEGRATION_OBJS) $(C_TESTLIB_ANALYZE_OBJS) \
+	$(C_TESTLIB_STRESS_OBJS)

 TESTS_UNIT_OBJS := $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%.$(O))
 TESTS_INTEGRATION_OBJS := $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%.$(O))
+TESTS_INTEGRATION_CPP_OBJS := $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%.$(O))
+TESTS_ANALYZE_OBJS := $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%.$(O))
 TESTS_STRESS_OBJS := $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%.$(O))
-TESTS_OBJS := $(TESTS_UNIT_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_STRESS_OBJS)
+TESTS_STRESS_CPP_OBJS := $(TESTS_STRESS_CPP:$(srcroot)%.cpp=$(objroot)%.$(O))
+TESTS_OBJS := $(TESTS_UNIT_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_ANALYZE_OBJS) \
+	$(TESTS_STRESS_OBJS)
+TESTS_CPP_OBJS := $(TESTS_INTEGRATION_CPP_OBJS) $(TESTS_STRESS_CPP_OBJS)

 .PHONY: all dist build_doc_html build_doc_man build_doc
 .PHONY: install_bin install_include install_lib
 .PHONY: install_doc_html install_doc_man install_doc install
 .PHONY: tests check clean distclean relclean

-.SECONDARY : $(TESTS_OBJS)
+.SECONDARY : $(PRIVATE_NAMESPACE_GEN_HDRS) $(TESTS_OBJS) $(TESTS_CPP_OBJS)

 # Default target.
 all: build_lib

 dist: build_doc

-$(objroot)doc/%.html : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/html.xsl
+$(objroot)doc/%$(install_suffix).html : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/html.xsl
+ifneq ($(XSLROOT),)
 	$(XSLTPROC) -o $@ $(objroot)doc/html.xsl $<
+else
+ifeq ($(wildcard $(DOCS_HTML)),)
+	@echo "<p>Missing xsltproc.  Doc not built.</p>" > $@
+endif
+	@echo "Missing xsltproc.  "$@" not (re)built."
+endif

-$(objroot)doc/%.3 : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/manpages.xsl
+$(objroot)doc/%$(install_suffix).3 : $(objroot)doc/%.xml $(srcroot)doc/stylesheet.xsl $(objroot)doc/manpages.xsl
+ifneq ($(XSLROOT),)
 	$(XSLTPROC) -o $@ $(objroot)doc/manpages.xsl $<
+# The -o option (output filename) of xsltproc may not work (it uses the
+# <refname> in the .xml file).  Manually add the suffix if so.
+  ifneq ($(install_suffix),)
+	@if [ -f $(objroot)doc/jemalloc.3 ]; then \
+		mv $(objroot)doc/jemalloc.3 $(objroot)doc/jemalloc$(install_suffix).3 ; \
+	fi
+  endif
+else
+ifeq ($(wildcard $(DOCS_MAN3)),)
+	@echo "Missing xsltproc.  Doc not built." > $@
+endif
+	@echo "Missing xsltproc.  "$@" not (re)built."
+endif

 build_doc_html: $(DOCS_HTML)
 build_doc_man: $(DOCS_MAN3)
@ -245,84 +440,173 @@ build_doc: $(DOCS)
 # Include generated dependency files.
 #
 ifdef CC_MM
+-include $(C_SYM_OBJS:%.$(O)=%.d)
 -include $(C_OBJS:%.$(O)=%.d)
+-include $(CPP_OBJS:%.$(O)=%.d)
 -include $(C_PIC_OBJS:%.$(O)=%.d)
+-include $(CPP_PIC_OBJS:%.$(O)=%.d)
+-include $(C_JET_SYM_OBJS:%.$(O)=%.d)
 -include $(C_JET_OBJS:%.$(O)=%.d)
 -include $(C_TESTLIB_OBJS:%.$(O)=%.d)
 -include $(TESTS_OBJS:%.$(O)=%.d)
+-include $(TESTS_CPP_OBJS:%.$(O)=%.d)
 endif

+$(C_SYM_OBJS): $(objroot)src/%.sym.$(O): $(srcroot)src/%.c
+$(C_SYM_OBJS): CPPFLAGS += -DJEMALLOC_NO_PRIVATE_NAMESPACE
+$(C_SYMS): $(objroot)src/%.sym: $(objroot)src/%.sym.$(O)
 $(C_OBJS): $(objroot)src/%.$(O): $(srcroot)src/%.c
+$(CPP_OBJS): $(objroot)src/%.$(O): $(srcroot)src/%.cpp
 $(C_PIC_OBJS): $(objroot)src/%.pic.$(O): $(srcroot)src/%.c
 $(C_PIC_OBJS): CFLAGS += $(PIC_CFLAGS)
+$(CPP_PIC_OBJS): $(objroot)src/%.pic.$(O): $(srcroot)src/%.cpp
+$(CPP_PIC_OBJS): CXXFLAGS += $(PIC_CFLAGS)
+$(C_JET_SYM_OBJS): $(objroot)src/%.jet.sym.$(O): $(srcroot)src/%.c
+$(C_JET_SYM_OBJS): CPPFLAGS += -DJEMALLOC_JET -DJEMALLOC_NO_PRIVATE_NAMESPACE
+$(C_JET_SYMS): $(objroot)src/%.jet.sym: $(objroot)src/%.jet.sym.$(O)
 $(C_JET_OBJS): $(objroot)src/%.jet.$(O): $(srcroot)src/%.c
-$(C_JET_OBJS): CFLAGS += -DJEMALLOC_JET
+$(C_JET_OBJS): CPPFLAGS += -DJEMALLOC_JET
 $(C_TESTLIB_UNIT_OBJS): $(objroot)test/src/%.unit.$(O): $(srcroot)test/src/%.c
 $(C_TESTLIB_UNIT_OBJS): CPPFLAGS += -DJEMALLOC_UNIT_TEST
 $(C_TESTLIB_INTEGRATION_OBJS): $(objroot)test/src/%.integration.$(O): $(srcroot)test/src/%.c
 $(C_TESTLIB_INTEGRATION_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_TEST
 $(C_UTIL_INTEGRATION_OBJS): $(objroot)src/%.integration.$(O): $(srcroot)src/%.c
+$(C_TESTLIB_ANALYZE_OBJS): $(objroot)test/src/%.analyze.$(O): $(srcroot)test/src/%.c
+$(C_TESTLIB_ANALYZE_OBJS): CPPFLAGS += -DJEMALLOC_ANALYZE_TEST
 $(C_TESTLIB_STRESS_OBJS): $(objroot)test/src/%.stress.$(O): $(srcroot)test/src/%.c
 $(C_TESTLIB_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST -DJEMALLOC_STRESS_TESTLIB
 $(C_TESTLIB_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include
 $(TESTS_UNIT_OBJS): CPPFLAGS += -DJEMALLOC_UNIT_TEST
 $(TESTS_INTEGRATION_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_TEST
+$(TESTS_INTEGRATION_CPP_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_CPP_TEST
+$(TESTS_ANALYZE_OBJS): CPPFLAGS += -DJEMALLOC_ANALYZE_TEST
 $(TESTS_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST
+$(TESTS_STRESS_CPP_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_CPP_TEST
 $(TESTS_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.c
+$(TESTS_CPP_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.cpp
 $(TESTS_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include
+$(TESTS_CPP_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include
+$(TESTS_OBJS): CFLAGS += -fno-builtin
+$(TESTS_CPP_OBJS): CPPFLAGS += -fno-builtin
 ifneq ($(IMPORTLIB),$(SO))
-$(C_OBJS) $(C_JET_OBJS): CPPFLAGS += -DDLLEXPORT
+$(CPP_OBJS) $(C_SYM_OBJS) $(C_OBJS) $(C_JET_SYM_OBJS) $(C_JET_OBJS): CPPFLAGS += -DDLLEXPORT
 endif

-ifndef CC_MM
 # Dependencies.
+ifndef CC_MM
 HEADER_DIRS = $(srcroot)include/jemalloc/internal \
 	$(objroot)include/jemalloc $(objroot)include/jemalloc/internal
-HEADERS = $(wildcard $(foreach dir,$(HEADER_DIRS),$(dir)/*.h))
-$(C_OBJS) $(C_PIC_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS): $(HEADERS)
-$(TESTS_OBJS): $(objroot)test/include/test/jemalloc_test.h
+HEADERS = $(filter-out $(PRIVATE_NAMESPACE_HDRS),$(wildcard $(foreach dir,$(HEADER_DIRS),$(dir)/*.h)))
+$(C_SYM_OBJS) $(C_OBJS) $(CPP_OBJS) $(C_PIC_OBJS) $(CPP_PIC_OBJS) $(C_JET_SYM_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS) $(TESTS_CPP_OBJS): $(HEADERS)
+$(TESTS_OBJS) $(TESTS_CPP_OBJS): $(objroot)test/include/test/jemalloc_test.h
 endif

-$(C_OBJS) $(C_PIC_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS): %.$(O):
+$(C_OBJS) $(CPP_OBJS) $(C_PIC_OBJS) $(CPP_PIC_OBJS) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_INTEGRATION_CPP_OBJS): $(objroot)include/jemalloc/internal/private_namespace.h
+$(C_JET_OBJS) $(C_TESTLIB_UNIT_OBJS) $(C_TESTLIB_ANALYZE_OBJS) $(C_TESTLIB_STRESS_OBJS) $(TESTS_UNIT_OBJS) $(TESTS_ANALYZE_OBJS) $(TESTS_STRESS_OBJS) $(TESTS_STRESS_CPP_OBJS): $(objroot)include/jemalloc/internal/private_namespace_jet.h
+
+$(C_SYM_OBJS) $(C_OBJS) $(C_PIC_OBJS) $(C_JET_SYM_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS): %.$(O):
 	@mkdir -p $(@D)
 	$(CC) $(CFLAGS) -c $(CPPFLAGS) $(CTARGET) $<
 ifdef CC_MM
 	@$(CC) -MM $(CPPFLAGS) -MT $@ -o $(@:%.$(O)=%.d) $<
 endif

+$(C_SYMS): %.sym:
+	@mkdir -p $(@D)
+	$(DUMP_SYMS) $< | $(AWK) -f $(objroot)include/jemalloc/internal/private_symbols.awk > $@
+
+$(C_JET_SYMS): %.sym:
+	@mkdir -p $(@D)
+	$(DUMP_SYMS) $< | $(AWK) -f $(objroot)include/jemalloc/internal/private_symbols_jet.awk > $@
+
+$(objroot)include/jemalloc/internal/private_namespace.gen.h: $(C_SYMS)
+	$(SHELL) $(srcroot)include/jemalloc/internal/private_namespace.sh $^ > $@
+
+$(objroot)include/jemalloc/internal/private_namespace_jet.gen.h: $(C_JET_SYMS)
+	$(SHELL) $(srcroot)include/jemalloc/internal/private_namespace.sh $^ > $@
+
+%.h: %.gen.h
+	@if ! `cmp -s $< $@` ; then echo "cp $< $@"; cp $< $@ ; fi
+
+$(CPP_OBJS) $(CPP_PIC_OBJS) $(TESTS_CPP_OBJS): %.$(O):
+	@mkdir -p $(@D)
+	$(CXX) $(CXXFLAGS) -c $(CPPFLAGS) $(CTARGET) $<
+ifdef CC_MM
+	@$(CXX) -MM $(CPPFLAGS) -MT $@ -o $(@:%.$(O)=%.d) $<
+endif
+
 ifneq ($(SOREV),$(SO))
 %.$(SO) : %.$(SOREV)
 	@mkdir -p $(@D)
 	ln -sf $(<F) $@
 endif

-$(objroot)lib/$(LIBJEMALLOC).$(SOREV) : $(if $(PIC_CFLAGS),$(C_PIC_OBJS),$(C_OBJS))
+$(objroot)lib/$(LIBJEMALLOC).$(SOREV) : $(if $(PIC_CFLAGS),$(C_PIC_OBJS),$(C_OBJS)) $(if $(PIC_CFLAGS),$(CPP_PIC_OBJS),$(CPP_OBJS))
 	@mkdir -p $(@D)
+ifeq (@enable_cxx@, 1)
+	$(CXX) $(DSO_LDFLAGS) $(call RPATH,$(RPATH_EXTRA)) $(LDTARGET) $+ $(LDFLAGS) $(LIBS) $(EXTRA_LDFLAGS)
+else
 	$(CC) $(DSO_LDFLAGS) $(call RPATH,$(RPATH_EXTRA)) $(LDTARGET) $+ $(LDFLAGS) $(LIBS) $(EXTRA_LDFLAGS)
+endif

-$(objroot)lib/$(LIBJEMALLOC)_pic.$(A) : $(C_PIC_OBJS)
-$(objroot)lib/$(LIBJEMALLOC).$(A) : $(C_OBJS)
-$(objroot)lib/$(LIBJEMALLOC)_s.$(A) : $(C_OBJS)
+$(objroot)lib/$(LIBJEMALLOC)_pic.$(A) : $(C_PIC_OBJS) $(CPP_PIC_OBJS)
+$(objroot)lib/$(LIBJEMALLOC).$(A) : $(C_OBJS) $(CPP_OBJS)
+$(objroot)lib/$(LIBJEMALLOC)_s.$(A) : $(C_OBJS) $(CPP_OBJS)

 $(STATIC_LIBS):
 	@mkdir -p $(@D)
 	$(AR) $(ARFLAGS)@AROUT@ $+

-$(objroot)test/unit/%$(EXE): $(objroot)test/unit/%.$(O) $(TESTS_UNIT_LINK_OBJS) $(C_JET_OBJS) $(C_TESTLIB_UNIT_OBJS)
+$(objroot)test/unit/%$(EXE): $(objroot)test/unit/%.$(O) $(C_JET_OBJS) $(C_TESTLIB_UNIT_OBJS)
 	@mkdir -p $(@D)
 	$(CC) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)

 $(objroot)test/integration/%$(EXE): $(objroot)test/integration/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB)
 	@mkdir -p $(@D)
-	$(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LJEMALLOC) $(LDFLAGS) $(filter-out -lm,$(filter -lrt -lpthread,$(LIBS))) $(LM) $(EXTRA_LDFLAGS)
+	$(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LJEMALLOC) $(LDFLAGS) $(filter-out -lm,$(filter -lrt -pthread -lstdc++,$(LIBS))) $(LM) $(EXTRA_LDFLAGS)
+
+$(objroot)test/integration/cpp/%$(EXE): $(objroot)test/integration/cpp/%.$(O) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB)
+	@mkdir -p $(@D)
+	$(CXX) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) $(LDFLAGS) $(filter-out -lm,$(LIBS)) -lm $(EXTRA_LDFLAGS)
+
+$(objroot)test/analyze/%$(EXE): $(objroot)test/analyze/%.$(O) $(C_JET_OBJS) $(C_TESTLIB_ANALYZE_OBJS)
+	@mkdir -p $(@D)
+	$(CC) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)

 $(objroot)test/stress/%$(EXE): $(objroot)test/stress/%.$(O) $(C_JET_OBJS) $(C_TESTLIB_STRESS_OBJS) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB)
 	@mkdir -p $(@D)
 	$(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)

+$(objroot)test/stress/pa/pa_data_preprocessor$(EXE): $(objroot)test/stress/pa/pa_data_preprocessor.$(O)
+	@mkdir -p $(@D)
+	$(CXX) $(LDTARGET) $(filter %.$(O),$^) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)
+
+$(objroot)test/stress/pa/pa_microbench$(EXE): $(objroot)test/stress/pa/pa_microbench.$(O) $(C_JET_OBJS) $(C_TESTLIB_STRESS_OBJS)
+	@mkdir -p $(@D)
+	$(CC) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)
+
+$(objroot)test/stress/pa/%.$(O): $(srcroot)test/stress/pa/%.c
+	@mkdir -p $(@D)
+	$(CC) $(CFLAGS) -c $(CPPFLAGS) -DJEMALLOC_STRESS_TEST -I$(srcroot)test/include -I$(objroot)test/include $(CTARGET) $<
+ifdef CC_MM
+	@$(CC) -MM $(CPPFLAGS) -DJEMALLOC_STRESS_TEST -I$(srcroot)test/include -I$(objroot)test/include -MT $@ -o $(@:%.$(O)=%.d) $<
+endif
+
+$(objroot)test/stress/pa/%.$(O): $(srcroot)test/stress/pa/%.cpp
+	@mkdir -p $(@D)
+	$(CXX) $(CXXFLAGS) -c $(CPPFLAGS) -I$(srcroot)test/include -I$(objroot)test/include $(CTARGET) $<
+ifdef CC_MM
+	@$(CXX) -MM $(CPPFLAGS) -I$(srcroot)test/include -I$(objroot)test/include -MT $@ -o $(@:%.$(O)=%.d) $<
+endif
+
 build_lib_shared: $(DSOS)
 build_lib_static: $(STATIC_LIBS)
-build_lib: build_lib_shared build_lib_static
+ifeq ($(enable_shared), 1)
+build_lib: build_lib_shared
+endif
+ifeq ($(enable_static), 1)
+build_lib: build_lib_static
+endif

 install_bin:
 	$(INSTALL) -d $(BINDIR)
@ -359,16 +643,22 @@ install_lib_pc: $(PC)
 	$(INSTALL) -m 644 $$l $(LIBDIR)/pkgconfig; \
 done

-install_lib: install_lib_shared install_lib_static install_lib_pc
+ifeq ($(enable_shared), 1)
+install_lib: install_lib_shared
+endif
+ifeq ($(enable_static), 1)
+install_lib: install_lib_static
+endif
+install_lib: install_lib_pc

-install_doc_html:
+install_doc_html: build_doc_html
 	$(INSTALL) -d $(DATADIR)/doc/jemalloc$(install_suffix)
 	@for d in $(DOCS_HTML); do \
 	echo "$(INSTALL) -m 644 $$d $(DATADIR)/doc/jemalloc$(install_suffix)"; \
 	$(INSTALL) -m 644 $$d $(DATADIR)/doc/jemalloc$(install_suffix); \
 done

-install_doc_man:
+install_doc_man: build_doc_man
 	$(INSTALL) -d $(MANDIR)/man3
 	@for d in $(DOCS_MAN3); do \
 	echo "$(INSTALL) -m 644 $$d $(MANDIR)/man3"; \
@ -377,94 +667,124 @@ done

 install_doc: install_doc_html install_doc_man

-install: install_bin install_include install_lib install_doc
+install: install_bin install_include install_lib
+
+ifeq ($(enable_doc), 1)
+install: install_doc
+endif
+
+uninstall_bin:
+	$(RM) -v $(foreach b,$(notdir $(BINS)),$(BINDIR)/$(b))
+
+uninstall_include:
+	$(RM) -v $(foreach h,$(notdir $(C_HDRS)),$(INCLUDEDIR)/jemalloc/$(h))
+	rmdir -v $(INCLUDEDIR)/jemalloc
+
+uninstall_lib_shared:
+	$(RM) -v $(LIBDIR)/$(LIBJEMALLOC).$(SOREV)
+ifneq ($(SOREV),$(SO))
+	$(RM) -v $(LIBDIR)/$(LIBJEMALLOC).$(SO)
+endif
+
+uninstall_lib_static:
+	$(RM) -v $(foreach l,$(notdir $(STATIC_LIBS)),$(LIBDIR)/$(l))
+
+uninstall_lib_pc:
+	$(RM) -v $(foreach p,$(notdir $(PC)),$(LIBDIR)/pkgconfig/$(p))
+
+ifeq ($(enable_shared), 1)
+uninstall_lib: uninstall_lib_shared
+endif
+ifeq ($(enable_static), 1)
+uninstall_lib: uninstall_lib_static
+endif
+uninstall_lib: uninstall_lib_pc
+
+uninstall_doc_html:
+	$(RM) -v $(foreach d,$(notdir $(DOCS_HTML)),$(DATADIR)/doc/jemalloc$(install_suffix)/$(d))
+	rmdir -v $(DATADIR)/doc/jemalloc$(install_suffix)
+
+uninstall_doc_man:
+	$(RM) -v $(foreach d,$(notdir $(DOCS_MAN3)),$(MANDIR)/man3/$(d))
+
+uninstall_doc: uninstall_doc_html uninstall_doc_man
+
+uninstall: uninstall_bin uninstall_include uninstall_lib
+
+ifeq ($(enable_doc), 1)
+uninstall: uninstall_doc
+endif

 tests_unit: $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%$(EXE))
-tests_integration: $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%$(EXE))
-tests_stress: $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%$(EXE))
-tests: tests_unit tests_integration tests_stress
+tests_integration: $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%$(EXE)) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%$(EXE))
+tests_analyze: $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%$(EXE))
+tests_stress: $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%$(EXE)) $(TESTS_STRESS_CPP:$(srcroot)%.cpp=$(objroot)%$(EXE))
+tests_pa: $(objroot)test/stress/pa/pa_data_preprocessor$(EXE) $(objroot)test/stress/pa/pa_microbench$(EXE)
+tests: tests_unit tests_integration tests_analyze tests_stress

 check_unit_dir:
 	@mkdir -p $(objroot)test/unit
 check_integration_dir:
 	@mkdir -p $(objroot)test/integration
+analyze_dir:
+	@mkdir -p $(objroot)test/analyze
 stress_dir:
 	@mkdir -p $(objroot)test/stress
 check_dir: check_unit_dir check_integration_dir

 check_unit: tests_unit check_unit_dir
-	$(MALLOC_CONF)="purge:ratio" $(SHELL) $(objroot)test/test.sh $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%)
-	$(MALLOC_CONF)="purge:decay" $(SHELL) $(objroot)test/test.sh $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%)
+	$(SHELL) $(objroot)test/test.sh $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%)
 check_integration_prof: tests_integration check_integration_dir
 ifeq ($(enable_prof), 1)
-	$(MALLOC_CONF)="prof:true" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%)
-	$(MALLOC_CONF)="prof:true,prof_active:false" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%)
+	$(MALLOC_CONF)="prof:true" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%)
+	$(MALLOC_CONF)="prof:true,prof_active:false" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%)
 endif
 check_integration_decay: tests_integration check_integration_dir
-	$(MALLOC_CONF)="purge:decay,decay_time:-1" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%)
-	$(MALLOC_CONF)="purge:decay,decay_time:0" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%)
-	$(MALLOC_CONF)="purge:decay" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%)
+	$(MALLOC_CONF)="dirty_decay_ms:-1,muzzy_decay_ms:-1" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%)
+	$(MALLOC_CONF)="dirty_decay_ms:0,muzzy_decay_ms:0" $(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%)
 check_integration: tests_integration check_integration_dir
-	$(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%)
+	$(SHELL) $(objroot)test/test.sh $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%)
+analyze: tests_analyze analyze_dir
+ifeq ($(enable_prof), 1)
+	$(MALLOC_CONF)="prof:true" $(SHELL) $(objroot)test/test.sh $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%)
+else
+	$(SHELL) $(objroot)test/test.sh $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%)
+endif
 stress: tests_stress stress_dir
 	$(SHELL) $(objroot)test/test.sh $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%)
+	$(SHELL) $(objroot)test/test.sh $(TESTS_STRESS_CPP:$(srcroot)%.cpp=$(objroot)%)
 check: check_unit check_integration check_integration_decay check_integration_prof

-ifeq ($(enable_code_coverage), 1)
-coverage_unit: check_unit
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)src jet $(C_JET_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)test/src unit $(C_TESTLIB_UNIT_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)test/unit unit $(TESTS_UNIT_OBJS)
-
-coverage_integration: check_integration
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)src pic $(C_PIC_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)src integration $(C_UTIL_INTEGRATION_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)test/src integration $(C_TESTLIB_INTEGRATION_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)test/integration integration $(TESTS_INTEGRATION_OBJS)
-
-coverage_stress: stress
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)src pic $(C_PIC_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)src jet $(C_JET_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)test/src stress $(C_TESTLIB_STRESS_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)test/stress stress $(TESTS_STRESS_OBJS)
-
-coverage: check
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)src pic $(C_PIC_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)src jet $(C_JET_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)src integration $(C_UTIL_INTEGRATION_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)test/src unit $(C_TESTLIB_UNIT_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)test/src integration $(C_TESTLIB_INTEGRATION_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)test/src stress $(C_TESTLIB_STRESS_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)test/unit unit $(TESTS_UNIT_OBJS) $(TESTS_UNIT_AUX_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)test/integration integration $(TESTS_INTEGRATION_OBJS)
-	$(SHELL) $(srcroot)coverage.sh $(srcroot)test/stress integration $(TESTS_STRESS_OBJS)
-endif
-
 clean:
+	rm -f $(PRIVATE_NAMESPACE_HDRS)
+	rm -f $(PRIVATE_NAMESPACE_GEN_HDRS)
+	rm -f $(C_SYM_OBJS)
+	rm -f $(C_SYMS)
 	rm -f $(C_OBJS)
+	rm -f $(CPP_OBJS)
 	rm -f $(C_PIC_OBJS)
+	rm -f $(CPP_PIC_OBJS)
+	rm -f $(C_JET_SYM_OBJS)
+	rm -f $(C_JET_SYMS)
 	rm -f $(C_JET_OBJS)
 	rm -f $(C_TESTLIB_OBJS)
+	rm -f $(C_SYM_OBJS:%.$(O)=%.d)
 	rm -f $(C_OBJS:%.$(O)=%.d)
-	rm -f $(C_OBJS:%.$(O)=%.gcda)
-	rm -f $(C_OBJS:%.$(O)=%.gcno)
+	rm -f $(CPP_OBJS:%.$(O)=%.d)
 	rm -f $(C_PIC_OBJS:%.$(O)=%.d)
-	rm -f $(C_PIC_OBJS:%.$(O)=%.gcda)
-	rm -f $(C_PIC_OBJS:%.$(O)=%.gcno)
+	rm -f $(CPP_PIC_OBJS:%.$(O)=%.d)
+	rm -f $(C_JET_SYM_OBJS:%.$(O)=%.d)
 	rm -f $(C_JET_OBJS:%.$(O)=%.d)
-	rm -f $(C_JET_OBJS:%.$(O)=%.gcda)
-	rm -f $(C_JET_OBJS:%.$(O)=%.gcno)
 	rm -f $(C_TESTLIB_OBJS:%.$(O)=%.d)
-	rm -f $(C_TESTLIB_OBJS:%.$(O)=%.gcda)
-	rm -f $(C_TESTLIB_OBJS:%.$(O)=%.gcno)
 	rm -f $(TESTS_OBJS:%.$(O)=%$(EXE))
 	rm -f $(TESTS_OBJS)
 	rm -f $(TESTS_OBJS:%.$(O)=%.d)
-	rm -f $(TESTS_OBJS:%.$(O)=%.gcda)
-	rm -f $(TESTS_OBJS:%.$(O)=%.gcno)
 	rm -f $(TESTS_OBJS:%.$(O)=%.out)
+	rm -f $(TESTS_CPP_OBJS:%.$(O)=%$(EXE))
+	rm -f $(TESTS_CPP_OBJS)
+	rm -f $(TESTS_CPP_OBJS:%.$(O)=%.d)
+	rm -f $(TESTS_CPP_OBJS:%.$(O)=%.out)
 	rm -f $(DSOS) $(STATIC_LIBS)
-	rm -f $(objroot)*.gcov.*

 distclean: clean
 	rm -f $(objroot)bin/jemalloc-config
--- a/14
+++ b/14
@ -3,12 +3,12 @@ fragmentation avoidance and scalable concurrency support.  jemalloc first came
 into use as the FreeBSD libc allocator in 2005, and since then it has found its
 way into numerous applications that rely on its predictable behavior.  In 2010
 jemalloc development efforts broadened to include developer support features
-such as heap profiling, Valgrind integration, and extensive monitoring/tuning
-hooks.  Modern jemalloc releases continue to be integrated back into FreeBSD,
-and therefore versatility remains critical.  Ongoing development efforts trend
-toward making jemalloc among the best allocators for a broad range of demanding
-applications, and eliminating/mitigating weaknesses that have practical
-repercussions for real world applications.
+such as heap profiling and extensive monitoring/tuning hooks.  Modern jemalloc
+releases continue to be integrated back into FreeBSD, and therefore versatility
+remains critical.  Ongoing development efforts trend toward making jemalloc
+among the best allocators for a broad range of demanding applications, and
+eliminating/mitigating weaknesses that have practical repercussions for real
+world applications.

 The COPYING file contains copyright and licensing information.

@ -17,4 +17,4 @@ jemalloc.

 The ChangeLog file contains a brief summary of changes for each release.

-URL: http://jemalloc.net/
+URL: https://jemalloc.net/
--- a/TUNING.md
+++ b/TUNING.md
@ -0,0 +1,129 @@
+This document summarizes the common approaches for performance fine tuning with
+jemalloc (as of 5.3.0).  The default configuration of jemalloc tends to work
+reasonably well in practice, and most applications should not have to tune any
+options. However, in order to cover a wide range of applications and avoid
+pathological cases, the default setting is sometimes kept conservative and
+suboptimal, even for many common workloads.  When jemalloc is properly tuned for
+a specific application / workload, it is common to improve system level metrics
+by a few percent, or make favorable trade-offs.
+
+
+## Notable runtime options for performance tuning
+
+Runtime options can be set via
+[malloc_conf](https://jemalloc.net/jemalloc.3.html#tuning).
+
+* [background_thread](https://jemalloc.net/jemalloc.3.html#background_thread)
+
+    Enabling jemalloc background threads generally improves the tail latency for
+    application threads, since unused memory purging is shifted to the dedicated
+    background threads.  In addition, unintended purging delay caused by
+    application inactivity is avoided with background threads.
+
+    Suggested: `background_thread:true` when jemalloc managed threads can be
+    allowed.
+
+* [metadata_thp](https://jemalloc.net/jemalloc.3.html#opt.metadata_thp)
+
+    Allowing jemalloc to utilize transparent huge pages for its internal
+    metadata usually reduces TLB misses significantly, especially for programs
+    with large memory footprint and frequent allocation / deallocation
+    activities.  Metadata memory usage may increase due to the use of huge
+    pages.
+
+    Suggested for allocation intensive programs: `metadata_thp:auto` or
+    `metadata_thp:always`, which is expected to improve CPU utilization at a
+    small memory cost.
+
+* [dirty_decay_ms](https://jemalloc.net/jemalloc.3.html#opt.dirty_decay_ms) and
+  [muzzy_decay_ms](https://jemalloc.net/jemalloc.3.html#opt.muzzy_decay_ms)
+
+    Decay time determines how fast jemalloc returns unused pages back to the
+    operating system, and therefore provides a fairly straightforward trade-off
+    between CPU and memory usage.  Shorter decay time purges unused pages faster
+    to reduce memory usage (usually at the cost of more CPU cycles spent on
+    purging), and vice versa.
+
+    Suggested: tune the values based on the desired trade-offs.
+
+* [narenas](https://jemalloc.net/jemalloc.3.html#opt.narenas)
+
+    By default jemalloc uses multiple arenas to reduce internal lock contention.
+    However, a high arena count may also increase overall memory fragmentation,
+    since arenas manage memory independently.  When a high degree of parallelism
+    is not expected at the allocator level, a lower number of arenas often
+    improves memory usage.
+
+    Suggested: if low parallelism is expected, try lower arena count while
+    monitoring CPU and memory usage.
+
+* [percpu_arena](https://jemalloc.net/jemalloc.3.html#opt.percpu_arena)
+
+    Enable dynamic thread to arena association based on running CPU.  This has
+    the potential to improve locality, e.g. when thread to CPU affinity is
+    present.
+    
+    Suggested: try `percpu_arena:percpu` or `percpu_arena:phycpu` if
+    thread migration between processors is expected to be infrequent.
+
+Examples:
+
+* High resource consumption application, prioritizing CPU utilization:
+
+    `background_thread:true,metadata_thp:auto` combined with relaxed decay time
+    (increased `dirty_decay_ms` and / or `muzzy_decay_ms`,
+    e.g. `dirty_decay_ms:30000,muzzy_decay_ms:30000`).
+
+* High resource consumption application, prioritizing memory usage:
+
+    `background_thread:true,tcache_max:4096` combined with shorter decay time
+    (decreased `dirty_decay_ms` and / or `muzzy_decay_ms`,
+    e.g. `dirty_decay_ms:5000,muzzy_decay_ms:5000`), and lower arena count
+    (e.g. number of CPUs).
+
+* Low resource consumption application:
+
+    `narenas:1,tcache_max:1024` combined with shorter decay time (decreased
+    `dirty_decay_ms` and / or `muzzy_decay_ms`, e.g.
+    `dirty_decay_ms:1000,muzzy_decay_ms:0`).
+
+* Extremely conservative -- minimize memory usage at all costs, only suitable when
+allocation activity is very rare:
+
+    `narenas:1,tcache:false,dirty_decay_ms:0,muzzy_decay_ms:0`
+
+Note that it is recommended to combine the options with `abort_conf:true` which
+aborts immediately on illegal options.
+
+## Beyond runtime options
+
+In addition to the runtime options, there are a number of programmatic ways to
+improve application performance with jemalloc.
+
+* [Explicit arenas](https://jemalloc.net/jemalloc.3.html#arenas.create)
+
+    Manually created arenas can help performance in various ways, e.g. by
+    managing locality and contention for specific usages.  For example,
+    applications can explicitly allocate frequently accessed objects from a
+    dedicated arena with
+    [mallocx()](https://jemalloc.net/jemalloc.3.html#MALLOCX_ARENA) to improve
+    locality.  In addition, explicit arenas often benefit from individually
+    tuned options, e.g. relaxed [decay
+    time](https://jemalloc.net/jemalloc.3.html#arena.i.dirty_decay_ms) if
+    frequent reuse is expected.
+
+* [Extent hooks](https://jemalloc.net/jemalloc.3.html#arena.i.extent_hooks)
+
+    Extent hooks allow customization for managing underlying memory.  One use
+    case for performance purpose is to utilize huge pages -- for example,
+    [HHVM](https://github.com/facebook/hhvm/blob/master/hphp/util/alloc.cpp)
+    uses explicit arenas with customized extent hooks to manage 1GB huge pages
+    for frequently accessed data, which reduces TLB misses significantly.
+
+* [Explicit thread-to-arena
+  binding](https://jemalloc.net/jemalloc.3.html#thread.arena)
+
+    It is common for some threads in an application to have different memory
+    access / allocation patterns.  Threads with heavy workloads often benefit
+    from explicit binding, e.g. binding very active threads to dedicated arenas
+    may reduce contention at the allocator level.
--- a/autogen.sh
+++ b/autogen.sh
@ -9,8 +9,8 @@ for i in autoconf; do
    fi
 done

-echo "./configure --enable-autogen $@"
-./configure --enable-autogen $@
+echo "./configure --enable-autogen \"$@\""
+./configure --enable-autogen "$@"
 if [ $? -ne 0 ]; then
    echo "Error $? in ./configure"
    exit 1
--- a/bin/jemalloc-config.in
+++ b/bin/jemalloc-config.in
@ -18,6 +18,7 @@ Options:
  --cc         : Print compiler used to build jemalloc.
  --cflags     : Print compiler flags used to build jemalloc.
  --cppflags   : Print preprocessor flags used to build jemalloc.
+  --cxxflags   : Print C++ compiler flags used to build jemalloc.
  --ldflags    : Print library flags used to build jemalloc.
  --libs       : Print libraries jemalloc was linked against.
 EOF
@ -67,6 +68,9 @@ case "$1" in
 --cppflags)
 	echo "@CPPFLAGS@"
 	;;
+--cxxflags)
+	echo "@CXXFLAGS@"
+	;;
 --ldflags)
 	echo "@LDFLAGS@ @EXTRA_LDFLAGS@"
 	;;
--- a/bin/jeprof.in
+++ b/bin/jeprof.in
@ -71,6 +71,7 @@
 use strict;
 use warnings;
 use Getopt::Long;
+use Cwd;

 my $JEPROF_VERSION = "@jemalloc_version@";
 my $PPROF_VERSION = "2.0";
@ -87,6 +88,7 @@ my %obj_tool_map = (
  #"nm_pdb" => "nm-pdb",       # for reading windows (PDB-format) executables
  #"addr2line_pdb" => "addr2line-pdb",                                # ditto
  #"otool" => "otool",         # equivalent of objdump on OS X
+  #"dyld_info" => "dyld_info",   # equivalent of otool on OS X for shared cache
 );
 # NOTE: these are lists, so you can put in commandline flags if you want.
 my @DOT = ("dot");          # leave non-absolute, since it may be in /usr/local
@ -204,6 +206,8 @@ Output type:
   --svg               Generate SVG to stdout
   --gif               Generate GIF to stdout
   --raw               Generate symbolized jeprof data (useful with remote fetch)
+   --collapsed         Generate collapsed stacks for building flame graphs
+                       (see http://www.brendangregg.com/flamegraphs.html)

 Heap-Profile Options:
   --inuse_space       Display in-use (mega)bytes [default]
@ -237,6 +241,7 @@ Miscellaneous:
   --test              Run unit tests
   --help              This message
   --version           Version information
+   --debug-syms-by-id  (Linux only) Find debug symbol files by build ID as well as by name

 Environment Variables:
   JEPROF_TMPDIR        Profiles directory. Defaults to \$HOME/jeprof
@ -331,6 +336,7 @@ sub Init() {
  $main::opt_gif = 0;
  $main::opt_svg = 0;
  $main::opt_raw = 0;
+  $main::opt_collapsed = 0;

  $main::opt_nodecount = 80;
  $main::opt_nodefraction = 0.005;
@ -361,6 +367,7 @@ sub Init() {
  $main::opt_tools   = "";
  $main::opt_debug   = 0;
  $main::opt_test    = 0;
+  $main::opt_debug_syms_by_id = 0;

  # These are undocumented flags used only by unittests.
  $main::opt_test_stride = 0;
@ -404,6 +411,7 @@ sub Init() {
             "svg!"           => \$main::opt_svg,
             "gif!"           => \$main::opt_gif,
             "raw!"           => \$main::opt_raw,
+             "collapsed!"     => \$main::opt_collapsed,
             "interactive!"   => \$main::opt_interactive,
             "nodecount=i"    => \$main::opt_nodecount,
             "nodefraction=f" => \$main::opt_nodefraction,
@ -428,6 +436,7 @@ sub Init() {
             "tools=s"        => \$main::opt_tools,
             "test!"          => \$main::opt_test,
             "debug!"         => \$main::opt_debug,
+             "debug-syms-by-id!" => \$main::opt_debug_syms_by_id,
             # Undocumented flags used only by unittests:
             "test_stride=i"  => \$main::opt_test_stride,
      ) || usage("Invalid option(s)");
@ -489,6 +498,7 @@ sub Init() {
      $main::opt_svg +
      $main::opt_gif +
      $main::opt_raw +
+      $main::opt_collapsed +
      $main::opt_interactive +
      0;
  if ($modes > 1) {
@ -571,6 +581,11 @@ sub Init() {
  foreach (@prefix_list) {
    s|/+$||;
  }
+
+  # Flag to prevent us from trying over and over to use
+  #  elfutils if it's not installed (used only with
+  #  --debug-syms-by-id option).
+  $main::gave_up_on_elfutils = 0;
 }

 sub FilterAndPrint {
@ -620,6 +635,8 @@ sub FilterAndPrint {
      PrintText($symbols, $flat, $cumulative, -1);
    } elsif ($main::opt_raw) {
      PrintSymbolizedProfile($symbols, $profile, $main::prog);
+    } elsif ($main::opt_collapsed) {
+      PrintCollapsedStacks($symbols, $profile);
    } elsif ($main::opt_callgrind) {
      PrintCallgrind($calls);
    } else {
@ -672,15 +689,15 @@ sub Main() {
  my $symbol_map = {};

  # Read one profile, pick the last item on the list
-  my $data = ReadProfile($main::prog, pop(@main::profile_files));
+  my $data = ReadProfile($main::prog, $main::profile_files[0]);
  my $profile = $data->{profile};
  my $pcs = $data->{pcs};
  my $libs = $data->{libs};   # Info about main program and shared libraries
  $symbol_map = MergeSymbols($symbol_map, $data->{symbols});

  # Add additional profiles, if available.
-  if (scalar(@main::profile_files) > 0) {
-    foreach my $pname (@main::profile_files) {
+  if (scalar(@main::profile_files) > 1) {
+    foreach my $pname (@main::profile_files[1..$#main::profile_files]) {
      my $data2 = ReadProfile($main::prog, $pname);
      $profile = AddProfile($profile, $data2->{profile});
      $pcs = AddPcs($pcs, $data2->{pcs});
@ -2809,6 +2826,40 @@ sub IsSecondPcAlwaysTheSame {
  return $second_pc;
 }

+sub ExtractSymbolNameInlineStack {
+  my $symbols = shift;
+  my $address = shift;
+
+  my @stack = ();
+
+  if (exists $symbols->{$address}) {
+    my @localinlinestack = @{$symbols->{$address}};
+    for (my $i = $#localinlinestack; $i > 0; $i-=3) {
+      my $file = $localinlinestack[$i-1];
+      my $fn = $localinlinestack[$i-0];
+
+      if ($file eq "?" || $file eq ":0") {
+        $file = "??:0";
+      }
+      if ($fn eq '??') {
+        # If we can't get the symbol name, at least use the file information.
+        $fn = $file;
+      }
+      my $suffix = "[inline]";
+      if ($i == 2) {
+        $suffix = "";
+      }
+      push (@stack, $fn.$suffix);
+    }
+  }
+  else {
+    # If we can't get a symbol name, at least fill in the address.
+    push (@stack, $address);
+  }
+
+  return @stack;
+}
+
 sub ExtractSymbolLocation {
  my $symbols = shift;
  my $address = shift;
@ -2883,6 +2934,17 @@ sub FilterFrames {
  return $result;
 }

+# Print the profile in collapsed-stack form: one line per stack trace, made of
+# the frame names joined by ";" (in the reverse of the order stored in the
+# profile key), a space, and the count as an integer.
+sub PrintCollapsedStacks {
+  my ($symbols, $profile) = @_;
+
+  while (my ($trace, $samples) = each %$profile) {
+    # The profile key holds one address per line; each address may expand to
+    # several names.
+    my @frames = ();
+    foreach my $pc (split(/\n/, $trace)) {
+      push(@frames, ExtractSymbolNameInlineStack($symbols, $pc));
+    }
+    printf("%s %d\n", join(";", reverse(@frames)), $samples);
+  }
+}
+
 sub RemoveUninterestingFrames {
  my $symbols = shift;
  my $profile = shift;
@ -2891,21 +2953,46 @@ sub RemoveUninterestingFrames {
  my %skip = ();
  my $skip_regexp = 'NOMATCH';
  if ($main::profile_type eq 'heap' || $main::profile_type eq 'growth') {
-    foreach my $name ('calloc',
+    foreach my $name ('@JEMALLOC_PREFIX@calloc',
                      'cfree',
-                      'malloc',
-                      'free',
-                      'memalign',
-                      'posix_memalign',
-                      'aligned_alloc',
+                      '@JEMALLOC_PREFIX@malloc',
+                      'je_malloc_default',
+                      'newImpl',
+                      'void* newImpl',
+                      'fallbackNewImpl',
+                      'void* fallbackNewImpl',
+                      'fallback_impl',
+                      'void* fallback_impl',
+                      'imalloc',
+                      'int imalloc',
+                      'imalloc_body',
+                      'int imalloc_body',
+                      'prof_alloc_prep',
+                      'prof_tctx_t *prof_alloc_prep',
+                      'prof_backtrace_impl',
+                      'void prof_backtrace_impl',
+                      'je_prof_backtrace',
+                      'void je_prof_backtrace',
+                      'je_prof_tctx_create',
+                      'prof_tctx_t* prof_tctx_create',
+                      '@JEMALLOC_PREFIX@free',
+                      '@JEMALLOC_PREFIX@memalign',
+                      '@JEMALLOC_PREFIX@posix_memalign',
+                      '@JEMALLOC_PREFIX@aligned_alloc',
                      'pvalloc',
-                      'valloc',
-                      'realloc',
-                      'mallocx', # jemalloc
-                      'rallocx', # jemalloc
-                      'xallocx', # jemalloc
-                      'dallocx', # jemalloc
-                      'sdallocx', # jemalloc
+                      '@JEMALLOC_PREFIX@valloc',
+                      '@JEMALLOC_PREFIX@realloc',
+                      '@JEMALLOC_PREFIX@mallocx',
+                      'irallocx_prof',
+                      'void *irallocx_prof',
+                      '@JEMALLOC_PREFIX@rallocx',
+                      'do_rallocx',
+                      'ixallocx_prof',
+                      'size_t ixallocx_prof',
+                      '@JEMALLOC_PREFIX@xallocx',
+                      '@JEMALLOC_PREFIX@dallocx',
+                      '@JEMALLOC_PREFIX@sdallocx',
+                      '@JEMALLOC_PREFIX@sdallocx_noflags',
                      'tc_calloc',
                      'tc_cfree',
                      'tc_malloc',
@ -3014,6 +3101,8 @@ sub RemoveUninterestingFrames {
    foreach my $a (@addrs) {
      if (exists($symbols->{$a})) {
        my $func = $symbols->{$a}->[0];
+        # Remove suffix in the symbols following space when filtering.
+        $func =~ s/ .*//;
        if ($skip{$func} || ($func =~ m/$skip_regexp/)) {
          # Throw away the portion of the backtrace seen so far, under the
          # assumption that previous frames were for functions internal to the
@ -4436,16 +4525,54 @@ sub FindLibrary {
 # For libc libraries, the copy in /usr/lib/debug contains debugging symbols
 sub DebuggingLibrary {
  my $file = shift;
-  if ($file =~ m|^/|) {
-      if (-f "/usr/lib/debug$file") {
-        return "/usr/lib/debug$file";
-      } elsif (-f "/usr/lib/debug$file.debug") {
-        return "/usr/lib/debug$file.debug";
-      }
+
+  # Returns the path of a separate debug-info file for $file, or undef when
+  # none is found.  Only absolute paths are looked up.
+  if ($file !~ m|^/|) {
+    return undef;
  }
+
+  # Find debug symbol file if it's named after the library's name.
+
+  if (-f "/usr/lib/debug$file") {
+    if ($main::opt_debug) { print STDERR "found debug info for $file in /usr/lib/debug$file\n"; }
+    return "/usr/lib/debug$file";
+  } elsif (-f "/usr/lib/debug$file.debug") {
+    if ($main::opt_debug) { print STDERR "found debug info for $file in /usr/lib/debug$file.debug\n"; }
+    return "/usr/lib/debug$file.debug";
+  }
+
+  # The build-ID lookup below only runs when --debug-syms-by-id was given.
+  if (!$main::opt_debug_syms_by_id) {
+    if ($main::opt_debug) { print STDERR "no debug symbols found for $file\n"; }
+    return undef;
+  }
+
+  # Find debug file if it's named after the library's build ID.
+
+  if (!$main::gave_up_on_elfutils) {
+    # Quote every argument so that spaces or shell metacharacters in $file
+    # cannot break, or inject into, the command line.
+    my $cmd = ShellEscape("eu-readelf", "-n", $file);
+    my $readelf = `$cmd`;
+    if ($?) {
+      print STDERR "Cannot run eu-readelf. To use --debug-syms-by-id you must be on Linux, with elfutils installed.\n";
+      # Remember the failure so later calls skip eu-readelf entirely.
+      $main::gave_up_on_elfutils = 1;
+      return undef;
+    }
+    # Capture in list context rather than "my $x = $1 if ...": declaring a
+    # lexical under a statement modifier has undefined behaviour in Perl.
+    my ($buildID) = $readelf =~ /Build ID: ([A-Fa-f0-9]+)/s;
+    if (defined $buildID && length $buildID > 0) {
+      # Path is .build-id/<first two hex digits>/<remaining digits>.debug
+      my $symbolFile = '/usr/lib/debug/.build-id/' . substr($buildID, 0, 2) . '/' . substr($buildID, 2) . '.debug';
+      if (-e $symbolFile) {
+        if ($main::opt_debug) { print STDERR "found debug symbol file $symbolFile for $file\n"; }
+        return $symbolFile;
+      } else {
+        if ($main::opt_debug) { print STDERR "no debug symbol file found for $file, build ID: $buildID\n"; }
+        return undef;
+      }
+    }
+  }
+
+  if ($main::opt_debug) { print STDERR "no debug symbols found for $file, build ID unknown\n"; }
  return undef;
 }

+
 # Parse text section header of a library using objdump
 sub ParseTextSectionHeaderFromObjdump {
  my $lib = shift;
@ -4555,7 +4682,65 @@ sub ParseTextSectionHeaderFromOtool {
  return $r;
 }

+# Parse text section header of a library in OS X shared cache using dyld_info
+sub ParseTextSectionHeaderFromDyldInfo {
+  # Runs "dyld_info -segments" on $lib and returns a hash ref with the keys
+  # size, vma and file_offset for the __text section, or undef when the
+  # output cannot be used.
+  my $lib = shift;
+
+  my $size = undef;
+  my $vma;
+  my $file_offset;
+  # Get dyld_info output from the library file to figure out how to
+  # map between mapped addresses and addresses in the library.
+  my $cmd = ShellEscape($obj_tool_map{"dyld_info"}, "-segments", $lib);
+  open(DYLD, "$cmd |") || error("$cmd: $!\n");
+
+  while (<DYLD>) {
+    s/\r//g;         # turn windows-looking lines into unix-looking lines
+    # -segments:
+    #    load-address    segment section        sect-size  seg-size perm
+    #     0x1803E0000    __TEXT                                   112KB r.x
+    #     0x1803E4F34             __text            80960
+    #     0x1803F8B74             __auth_stubs        768
+    #     0x1803F8E74             __init_offsets        4
+    #     0x1803F8E78             __gcc_except_tab   1180
+    my @x = split;
+    if ($#x >= 2) {
+      if ($x[0] eq 'load-offset') {
+        # dyld_info should only be used for the shared lib.
+        # Close the pipe before bailing out so the handle and the child
+        # process are not left dangling.
+        close(DYLD);
+        return undef;
+      } elsif ($x[1] eq '__TEXT') {
+        # Remember the segment's address; the section offset is computed
+        # relative to it below.
+        $file_offset = $x[0];
+      } elsif ($x[1] eq '__text') {
+        $size = $x[2];
+        $vma = $x[0];
+        $file_offset = AddressSub($x[0], $file_offset);
+        last;
+      }
+    }
+  }
+  close(DYLD);
+
+  if (!defined($vma) || !defined($size) || !defined($file_offset)) {
+    return undef;
+  }
+
+  my $r = {};
+  $r->{size} = $size;
+  $r->{vma} = $vma;
+  $r->{file_offset} = $file_offset;
+
+  return $r;
+}
+
 sub ParseTextSectionHeader {
+  # obj_tool_map("dyld_info") is only defined if we're in a Mach-O environment
+  if (defined($obj_tool_map{"dyld_info"})) {
+    my $r = ParseTextSectionHeaderFromDyldInfo(@_);
+    if (defined($r)){
+      return $r;
+    }
+  }
+  # if dyld_info doesn't work, or we don't have it, fall back to otool
  # obj_tool_map("otool") is only defined if we're in a Mach-O environment
  if (defined($obj_tool_map{"otool"})) {
    my $r = ParseTextSectionHeaderFromOtool(@_);
@ -4570,7 +4755,7 @@ sub ParseTextSectionHeader {
 # Split /proc/pid/maps dump into a list of libraries
 sub ParseLibraries {
  return if $main::use_symbol_page;  # We don't need libraries info.
-  my $prog = shift;
+  my $prog = Cwd::abs_path(shift);
  my $map = shift;
  my $pcs = shift;

@ -4596,13 +4781,32 @@ sub ParseLibraries {
      $offset = HexExtend($3);
      $lib = $4;
      $lib =~ s|\\|/|g;     # turn windows-style paths into unix-style paths
-    } elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.so(\.\d+)*)/) {
+    } elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.(so|dll|dylib|bundle)(\.\d+)*)/) {
      # Cooked line from DumpAddressMap.  Example:
      #   40000000-40015000: /lib/ld-2.3.2.so
      $start = HexExtend($1);
      $finish = HexExtend($2);
      $offset = $zero_offset;
      $lib = $3;
+    } elsif (($l =~ /^($h)-($h)\s+..x.\s+($h)\s+\S+:\S+\s+\d+\s+(\S+)$/i) && ($4 eq $prog)) {
+      # PIEs and address space randomization do not play well with our
+      # default assumption that main executable is at lowest
+      # addresses. So we're detecting main executable in
+      # /proc/self/maps as well.
+      $start = HexExtend($1);
+      $finish = HexExtend($2);
+      $offset = HexExtend($3);
+      $lib = $4;
+      $lib =~ s|\\|/|g;     # turn windows-style paths into unix-style paths
+    } elsif (($l =~ /^\s*($h)-($h):\s*(\S+)/) && ($3 eq $prog)) {
+      # PIEs and address space randomization do not play well with our
+      # default assumption that main executable is at lowest
+      # addresses. So we're detecting main executable from
+      # DumpAddressMap as well.
+      $start = HexExtend($1);
+      $finish = HexExtend($2);
+      $offset = $zero_offset;
+      $lib = $3;
    }
    # FreeBSD 10.0 virtual memory map /proc/curproc/map as defined in
    # function procfs_doprocmap (sys/fs/procfs/procfs_map.c)
@ -4973,7 +5177,7 @@ sub MapToSymbols {
      } else {
 	# MapSymbolsWithNM tags each routine with its starting address,
 	# useful in case the image has multiple occurrences of this
-	# routine.  (It uses a syntax that resembles template paramters,
+	# routine.  (It uses a syntax that resembles template parameters,
 	# that are automatically stripped out by ShortFunctionName().)
 	# addr2line does not provide the same information.  So we check
 	# if nm disambiguated our symbol, and if so take the annotated
@ -5133,6 +5337,7 @@ sub ConfigureObjTools {
  if ($file_type =~ /Mach-O/) {
    # OS X uses otool to examine Mach-O files, rather than objdump.
    $obj_tool_map{"otool"} = "otool";
+    $obj_tool_map{"dyld_info"} = "dyld_info";
    $obj_tool_map{"addr2line"} = "false";  # no addr2line
    $obj_tool_map{"objdump"} = "false";  # no objdump
  }
@ -5325,7 +5530,7 @@ sub GetProcedureBoundaries {
  # "nm -f $image" is supposed to fail on GNU nm, but if:
  #
  # a. $image starts with [BbSsPp] (for example, bin/foo/bar), AND
-  # b. you have a.out in your current directory (a not uncommon occurence)
+  # b. you have a.out in your current directory (a not uncommon occurrence)
  #
  # then "nm -f $image" succeeds because -f only looks at the first letter of
  # the argument, which looks valid because it's [BbSsPp], and then since
@ -5353,7 +5558,7 @@ sub GetProcedureBoundaries {
  my $demangle_flag = "";
  my $cppfilt_flag = "";
  my $to_devnull = ">$dev_null 2>&1";
-  if (system(ShellEscape($nm, "--demangle", "image") . $to_devnull) == 0) {
+  if (system(ShellEscape($nm, "--demangle", $image) . $to_devnull) == 0) {
    # In this mode, we do "nm --demangle <foo>"
    $demangle_flag = "--demangle";
    $cppfilt_flag = "";
--- a/build-aux/config.guess
+++ b/build-aux/config.guess
--- a/build-aux/config.sub
+++ b/build-aux/config.sub
--- a/build-aux/install-sh
+++ b/build-aux/install-sh
@ -115,7 +115,7 @@ fi
 if [ x"$dir_arg" != x ]; then
 	dst=$src
 	src=""
-	
+
 	if [ -d $dst ]; then
 		instcmd=:
 	else
@ -124,7 +124,7 @@ if [ x"$dir_arg" != x ]; then
 else

 # Waiting for this to be detected by the "$instcmd $src $dsttmp" command
-# might cause directories to be created, which would be especially bad 
+# might cause directories to be created, which would be especially bad
 # if $src (and thus $dsttmp) contains '*'.

 	if [ -f $src -o -d $src ]
@ -134,7 +134,7 @@ else
 		echo "install:  $src does not exist"
 		exit 1
 	fi
-	
+
 	if [ x"$dst" = x ]
 	then
 		echo "install:	no destination specified"
@ -201,17 +201,17 @@ else

 # If we're going to rename the final executable, determine the name now.

-	if [ x"$transformarg" = x ] 
+	if [ x"$transformarg" = x ]
 	then
 		dstfile=`basename $dst`
 	else
-		dstfile=`basename $dst $transformbasename | 
+		dstfile=`basename $dst $transformbasename |
 			sed $transformarg`$transformbasename
 	fi

 # don't allow the sed command to completely eliminate the filename

-	if [ x"$dstfile" = x ] 
+	if [ x"$dstfile" = x ]
 	then
 		dstfile=`basename $dst`
 	else
@ -242,7 +242,7 @@ else
 # Now rename the file to the real destination.

 	$doit $rmcmd -f $dstdir/$dstfile &&
-	$doit $mvcmd $dsttmp $dstdir/$dstfile 
+	$doit $mvcmd $dsttmp $dstdir/$dstfile

 fi &&

--- a/configure.ac
+++ b/configure.ac
--- a/coverage.sh
+++ b/coverage.sh
@ -1,16 +0,0 @@
-#!/bin/sh
-
-set -e
-
-objdir=$1
-suffix=$2
-shift 2
-objs=$@
-
-gcov -b -p -f -o "${objdir}" ${objs}
-
-# Move gcov outputs so that subsequent gcov invocations won't clobber results
-# for the same sources with different compilation flags.
-for f in `find . -maxdepth 1 -type f -name '*.gcov'` ; do
-  mv "${f}" "${f}.${suffix}"
-done
--- a/doc/jemalloc.xml.in
+++ b/doc/jemalloc.xml.in
--- a/doc_internal/PROFILING_INTERNALS.md
+++ b/doc_internal/PROFILING_INTERNALS.md
@ -0,0 +1,145 @@
+# jemalloc profiling
+This describes the mathematical basis behind jemalloc's profiling implementation, as well as the implementation tricks that make it effective. Historically, the jemalloc profiling design simply copied tcmalloc's. The implementation has since diverged, due to both the desire to record additional information, and to correct some biasing bugs.
+
+Note: this document is markdown with embedded LaTeX; different markdown renderers may not produce the expected output.  Viewing with `pandoc -s PROFILING_INTERNALS.md -o PROFILING_INTERNALS.pdf` is recommended.
+
+## Some tricks in our implementation toolbag
+
+### Sampling
+Recording our metadata is quite expensive; we need to walk up the stack to get a stack trace. On top of that, we need to allocate storage to record that stack trace, and stick it somewhere where a profile-dumping call can find it. That call might happen on another thread, so we'll probably need to take a lock to do so. These costs are quite large compared to the average cost of an allocation. To manage this, we'll only sample some fraction of allocations. This will miss some of them, so our data will be incomplete, but we'll try to make up for it. We can tune our sampling rate to balance accuracy and performance.
+
+### Fast Bernoulli sampling
+Compared to our fast paths, even a `coinflip(p)` function can be quite expensive. Having to do a random-number generation and some floating point operations would be a sizeable relative cost. However (as pointed out in [[Vitter, 1987](https://dl.acm.org/doi/10.1145/23002.23003)]), if we can orchestrate our algorithm so that many of our `coinflip` calls share their parameter value, we can do better. We can sample from the geometric distribution, and initialize a counter with the result. When the counter hits 0, the `coinflip` function returns true (and reinitializes its internal counter).
+This can let us do a random-number generation once per (logical) coinflip that comes up heads, rather than once per (logical) coinflip. Since we expect to sample relatively rarely, this can be a large win.
+
+### Fast-path / slow-path thinking
+Most programs have a skewed distribution of allocations. Smaller allocations are much more frequent than large ones, but shorter lived and less common as a fraction of program memory. "Small" and "large" are necessarily sort of fuzzy terms, but if we define "small" as "allocations jemalloc puts into slabs" and "large" as the others, then it's not uncommon for small allocations to be hundreds of times more frequent than large ones, but take up around half the amount of heap space as large ones. Moreover, small allocations tend to be much cheaper than large ones (often by a factor of 20-30): they're more likely to hit in thread caches, less likely to have to do an mmap, and cheaper to fill (by the user) once the allocation has been returned.
+
+## An unbiased estimator of space consumption from (almost) arbitrary sampling strategies
+Suppose we have a sampling strategy that meets the following criteria:
+
+  - One allocation being sampled is independent of other allocations being sampled.
+  - Each allocation has a non-zero probability of being sampled.
+
+We can then estimate the bytes in live allocations through some particular stack trace as:
+
+$$ \sum_i S_i I_i \frac{1}{\mathrm{E}[I_i]} $$
+
+where the sum ranges over some index variable of live allocations from that stack, $S_i$ is the size of the $i$'th allocation, and $I_i$ is an indicator random variable for whether or not the $i$'th allocation is sampled. $S_i$ and $\mathrm{E}[I_i]$ are constants (the program allocations are fixed; the random variables are the sampling decisions), so taking the expectation we get
+
+$$ \sum_i S_i \mathrm{E}[I_i] \frac{1}{\mathrm{E}[I_i]}.$$
+
+This is of course $\sum_i S_i$, as we want (and, a similar calculation could be done for allocation counts as well).
+This is a fairly general strategy; note that while we require that sampling decisions be independent of one another's outcomes, they don't have to be independent of previous allocations, total bytes allocated, etc. You can imagine strategies that:
+
+  - Sample allocations at program startup at a higher rate than subsequent allocations
+  - Sample even-indexed allocations more frequently than odd-indexed ones (so long as no allocation has zero sampling probability)
+  - Let threads declare themselves as high-sampling-priority, and sample their allocations at an increased rate.
+
+These can all be fit into this framework to give an unbiased estimator.
+
+## Evaluating sampling strategies
+Not all strategies for picking allocations to sample are equally good, of course. Among unbiased estimators, the lower the variance, the lower the mean squared error. Using the estimator above, the variance is:
+
+$$
+\begin{aligned}
+& \mathrm{Var}[\sum_i S_i I_i \frac{1}{\mathrm{E}[I_i]}]  \\
+=& \sum_i \mathrm{Var}[S_i I_i \frac{1}{\mathrm{E}[I_i]}] \\
+=& \sum_i \frac{S_i^2}{\mathrm{E}[I_i]^2} \mathrm{Var}[I_i] \\
+=& \sum_i \frac{S_i^2}{\mathrm{E}[I_i]^2} \mathrm{E}[I_i](1 - \mathrm{E}[I_i]) \\
+=& \sum_i S_i^2 \frac{1 - \mathrm{E}[I_i]}{\mathrm{E}[I_i]}.
+\end{aligned}
+$$
+
+We can use this formula to compare various strategy choices. All else being equal, lower-variance strategies are better.
+
+## Possible sampling strategies
+Because of the desire to avoid the fast-path costs, we'd like to use our Bernoulli trick if possible. There are two obvious counters to use: a coinflip per allocation, and a coinflip per byte allocated.
+
+### Bernoulli sampling per-allocation
+An obvious strategy is to pick some large $N$, and give each allocation a $1/N$ chance of being sampled. This would let us use our Bernoulli-via-Geometric trick. Using the formula from above, we can compute the variance as:
+
+$$ \sum_i S_i^2 \frac{1 - \frac{1}{N}}{\frac{1}{N}}  = (N-1) \sum_i S_i^2.$$
+
+That is, an allocation of size $Z$ contributes a term of $(N-1)Z^2$ to the variance.
+
+### Bernoulli sampling per-byte
+Another option we have is to pick some rate $R$, and give each byte a $1/R$ chance of being picked for sampling (at which point we would sample its contained allocation). The chance of an allocation of size $Z$ being sampled, then, is
+
+$$1-(1-\frac{1}{R})^{Z}$$
+
+and an allocation of size $Z$ contributes a term of
+
+$$Z^2 \frac{(1-\frac{1}{R})^{Z}}{1-(1-\frac{1}{R})^{Z}}.$$
+
+In practical settings, $R$ is large, and so this is well-approximated by
+
+$$Z^2 \frac{e^{-Z/R}}{1 - e^{-Z/R}} .$$
+
+Just to get a sense of the dynamics here, let's look at the behavior for various values of $Z$. When $Z$ is small relative to $R$, we can use $e^x \approx 1 + x$ (with $x = -Z/R$), and conclude that the variance contributed by a small-$Z$ allocation is around
+
+$$Z^2 \frac{1-Z/R}{Z/R} \approx RZ.$$
+
+When $Z$ is comparable to $R$, the variance term is near $Z^2$ (we have $\frac{e^{-Z/R}}{1 - e^{-Z/R}} = 1$ when $Z/R = \ln 2 \approx 0.693$). When $Z$ is large relative to $R$, the variance term goes to zero.
+
+## Picking a sampling strategy
+The fast-path/slow-path dynamics of allocation patterns point us towards the per-byte sampling approach:
+
+  - The quadratic increase in variance per allocation in the first approach is quite costly when heaps have a non-negligible portion of their bytes in those allocations, which is practically often the case.
+  - The Bernoulli-per-byte approach shifts more of its samples towards large allocations, which are already a slow-path.
+  - We drive several tickers (e.g. tcache gc) by bytes allocated, and report bytes-allocated as a user-visible statistic, so we have to do all the necessary bookkeeping anyways.
+
+Indeed, this is the approach we use in jemalloc. Our heap dumps record the size of the allocation and the sampling rate $R$, and jeprof unbiases by dividing by $1 - e^{-Z/R}$.  The framework above would suggest dividing by $1-(1-1/R)^Z$; instead, we use the fact that $R$ is large in practical situations, and so $e^{-Z/R}$ is a good approximation (and faster to compute).  (Equivalently, we may also see this as the factor that falls out from viewing sampling as a Poisson process directly).
+
+## Consequences for heap dump consumers
+Using this approach means that there are a few things users need to be aware of.
+
+### Stack counts are not proportional to allocation frequencies
+If one stack appears twice as often as another, this by itself does not imply that it allocates twice as often. Consider the case in which there are only two types of allocating call stacks in a program. Stack A allocates 8 bytes, and occurs a million times in a program. Stack B allocates 8 MB, and occurs just once in a program. If our sampling rate $R$ is about 1MB, we expect stack A to show up about 8 times, and stack B to show up once. Stack A isn't 8 times more frequent than stack B, though; it's a million times more frequent.
+
+### Aggregation must be done after unbiasing samples
+Some tools manually parse heap dump output, and aggregate across stacks (or across program runs) to provide wider-scale data analyses. When doing this aggregation, though, it's important to unbias-and-then-sum, rather than sum-and-then-unbias. Reusing our example from the previous section: suppose we collect heap dumps of the program from 1 million machines. We then have 8 million samples of stack A (8 per machine, each of 8 bytes), and 1 million samples of stack B (1 per machine, each of 8 MB).
+
+If we sum first then unbias based on this formula: $1 - e^{-Z/R}$ we get:
+
+$$Z = 8,000,000 * 8 bytes = 64MB$$
+$$64MB / (1 - e^{-64MB/1MB}) \approx 64MB (Stack A)$$
+
+$$Z = 1,000,000 * 8MB = 8TB$$
+$$8TB / (1 - e^{-8TB/1MB}) \approx 8TB (Stack B)$$
+
+Clearly we are unbiasing by an infinitesimal amount, which dramatically underreports the amount of memory allocated by stack A. Whereas if we unbias first and then sum:
+
+$$Z = 8 bytes$$
+$$8 bytes / (1 - e^{-8 bytes/1MB}) \approx 1MB$$
+$$1MB * 8,000,000 = 8TB (Stack A)$$
+
+$$Z = 8MB$$
+$$8MB / (1 - e^{-8MB/1MB})  \approx 8MB$$
+$$8MB * 1,000,000 = 8TB (Stack B)$$
+
+## An avenue for future exploration
+While the framework we laid out above is pretty general, as an engineering decision we're only interested in fairly simple approaches (i.e. ones for which the chance of an allocation being sampled depends only on its size). Our job is then: for each size class $Z$, pick a probability $p_Z$ that an allocation of that size will be sampled. We made some handwave-y references to statistical distributions to justify our choices, but there's no reason we need to pick them that way. Any set of non-zero probabilities is a valid choice.
+The real limiting factor in our ability to reduce estimator variance is the fact that sampling is expensive; we want to make sure we only do it on a small fraction of allocations. Our goal, then, is to pick the $p_Z$ to minimize variance given some maximum sampling rate $P$. If we define $a_Z$ to be the fraction of allocations of size $Z$, and $l_Z$ to be the fraction of allocations of size $Z$ still alive at the time of a heap dump, then we can phrase this as an optimization problem over the choices of $p_Z$:
+
+Minimize
+
+$$ \sum_Z Z^2 l_Z \frac{1-p_Z}{p_Z} $$
+
+subject to
+
+$$ \sum_Z a_Z p_Z \leq P $$
+
+Ignoring a term that doesn't depend on $p_Z$, the objective is minimized whenever
+
+$$ \sum_Z Z^2 l_Z \frac{1}{p_Z} $$
+
+is. For a particular program, $l_Z$ and $a_Z$ are just numbers that can be obtained (exactly) from existing stats introspection facilities, and we have a fairly tractable convex optimization problem (it can be framed as a second-order cone program). It would be interesting to evaluate, for various common allocation patterns, how well our current strategy adapts. Do our actual choices for $p_Z$ closely correspond to the optimal ones? How close is the variance of our choices to the variance of the optimal strategy?
+You can imagine an implementation that actually goes all the way, and makes $p_Z$ selections a tuning parameter. I don't think this is a good use of development time for the foreseeable future; but I do wonder about the answers to some of these questions.
+
+## Implementation realities
+
+The nice story above is at least partially a lie. Initially, jeprof (copying its logic from pprof)  had the sum-then-unbias error described above.  The current version of jemalloc does the unbiasing step on a per-allocation basis internally, so that we're always tracking what the unbiased numbers "should" be.  The problem is, actually surfacing those unbiased numbers would require a breaking change to jeprof (and the various already-deployed tools that have copied its logic). Instead, we use a little bit more trickery. Since we know at dump time the numbers we want jeprof to report, we simply choose the values we'll output so that the jeprof numbers will match the true numbers.  The math is described in `src/prof_data.c` (where the only cleverness is a change of variables that lets the exponentials fall out).
+
+This has the effect of making the output of jeprof (and related tools) correct, while making its inputs incorrect.  This can be annoying to human readers of raw profiling dump output.
--- a/doc_internal/jemalloc.svg
+++ b/doc_internal/jemalloc.svg
--- a/include/jemalloc/internal/arena.h
+++ b/include/jemalloc/internal/arena.h
--- a/include/jemalloc/internal/arena_externs.h
+++ b/include/jemalloc/internal/arena_externs.h
@ -0,0 +1,125 @@
+#ifndef JEMALLOC_INTERNAL_ARENA_EXTERNS_H
+#define JEMALLOC_INTERNAL_ARENA_EXTERNS_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/arena_stats.h"
+#include "jemalloc/internal/bin.h"
+#include "jemalloc/internal/div.h"
+#include "jemalloc/internal/emap.h"
+#include "jemalloc/internal/extent_dss.h"
+#include "jemalloc/internal/hook.h"
+#include "jemalloc/internal/pages.h"
+#include "jemalloc/internal/stats.h"
+
+/*
+ * When the amount of pages to be purged exceeds this amount, deferred purge
+ * should happen.
+ */
+#define ARENA_DEFERRED_PURGE_NPAGES_THRESHOLD UINT64_C(1024)
+
+extern ssize_t opt_dirty_decay_ms;
+extern ssize_t opt_muzzy_decay_ms;
+
+extern percpu_arena_mode_t opt_percpu_arena;
+extern const char *const   percpu_arena_mode_names[];
+
+extern div_info_t arena_binind_div_info[SC_NBINS];
+
+extern emap_t arena_emap_global;
+
+extern size_t opt_oversize_threshold;
+extern size_t oversize_threshold;
+
+extern bool      opt_huge_arena_pac_thp;
+extern pac_thp_t huge_arena_pac_thp;
+
+/*
+ * arena_bin_offsets[binind] is the offset of the first bin shard for size class
+ * binind.
+ */
+extern uint32_t arena_bin_offsets[SC_NBINS];
+
+void arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
+    const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
+    size_t *nactive, size_t *ndirty, size_t *nmuzzy);
+void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
+    const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
+    size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
+    bin_stats_data_t *bstats, arena_stats_large_t *lstats, pac_estats_t *estats,
+    hpa_shard_stats_t *hpastats);
+void arena_handle_deferred_work(tsdn_t *tsdn, arena_t *arena);
+edata_t *arena_extent_alloc_large(
+    tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, bool zero);
+void arena_extent_dalloc_large_prep(
+    tsdn_t *tsdn, arena_t *arena, edata_t *edata);
+void arena_extent_ralloc_large_shrink(
+    tsdn_t *tsdn, arena_t *arena, edata_t *edata, size_t oldusize);
+void arena_extent_ralloc_large_expand(
+    tsdn_t *tsdn, arena_t *arena, edata_t *edata, size_t oldusize);
+bool arena_decay_ms_set(
+    tsdn_t *tsdn, arena_t *arena, extent_state_t state, ssize_t decay_ms);
+ssize_t arena_decay_ms_get(arena_t *arena, extent_state_t state);
+void    arena_decay(
+       tsdn_t *tsdn, arena_t *arena, bool is_background_thread, bool all);
+uint64_t       arena_time_until_deferred(tsdn_t *tsdn, arena_t *arena);
+void           arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena);
+void           arena_reset(tsd_t *tsd, arena_t *arena);
+void           arena_destroy(tsd_t *tsd, arena_t *arena);
+cache_bin_sz_t arena_ptr_array_fill_small(tsdn_t *tsdn, arena_t *arena,
+    szind_t binind, cache_bin_ptr_array_t *arr, const cache_bin_sz_t nfill_min,
+    const cache_bin_sz_t nfill_max, cache_bin_stats_t merge_stats);
+
+void *arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind,
+    bool zero, bool slab);
+void *arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
+    bool zero, bool slab, tcache_t *tcache);
+void  arena_prof_promote(
+     tsdn_t *tsdn, void *ptr, size_t usize, size_t bumped_usize);
+void arena_dalloc_promoted(
+    tsdn_t *tsdn, void *ptr, tcache_t *tcache, bool slow_path);
+void arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, edata_t *slab);
+
+void  arena_dalloc_small(tsdn_t *tsdn, void *ptr);
+void  arena_ptr_array_flush(tsd_t *tsd, szind_t binind,
+     cache_bin_ptr_array_t *arr, unsigned nflush, bool small,
+     arena_t *stats_arena, cache_bin_stats_t merge_stats);
+bool  arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
+     size_t extra, bool zero, size_t *newsize);
+void *arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize,
+    size_t size, size_t alignment, bool zero, bool slab, tcache_t *tcache,
+    hook_ralloc_args_t *hook_args);
+dss_prec_t      arena_dss_prec_get(arena_t *arena);
+ehooks_t       *arena_get_ehooks(arena_t *arena);
+extent_hooks_t *arena_set_extent_hooks(
+    tsd_t *tsd, arena_t *arena, extent_hooks_t *extent_hooks);
+bool    arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec);
+void    arena_name_get(arena_t *arena, char *name);
+void    arena_name_set(arena_t *arena, const char *name);
+ssize_t arena_dirty_decay_ms_default_get(void);
+bool    arena_dirty_decay_ms_default_set(ssize_t decay_ms);
+ssize_t arena_muzzy_decay_ms_default_get(void);
+bool    arena_muzzy_decay_ms_default_set(ssize_t decay_ms);
+bool    arena_retain_grow_limit_get_set(
+       tsd_t *tsd, arena_t *arena, size_t *old_limit, size_t *new_limit);
+unsigned arena_nthreads_get(arena_t *arena, bool internal);
+void     arena_nthreads_inc(arena_t *arena, bool internal);
+void     arena_nthreads_dec(arena_t *arena, bool internal);
+arena_t *arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config);
+bool     arena_init_huge(tsdn_t *tsdn, arena_t *a0);
+arena_t *arena_choose_huge(tsd_t *tsd);
+size_t arena_fill_small_fresh(tsdn_t *tsdn, arena_t *arena, szind_t binind,
+    void **ptrs, size_t nfill, bool zero);
+bool   arena_boot(sc_data_t *sc_data, base_t *base, bool hpa);
+void   arena_prefork0(tsdn_t *tsdn, arena_t *arena);
+void   arena_prefork1(tsdn_t *tsdn, arena_t *arena);
+void   arena_prefork2(tsdn_t *tsdn, arena_t *arena);
+void   arena_prefork3(tsdn_t *tsdn, arena_t *arena);
+void   arena_prefork4(tsdn_t *tsdn, arena_t *arena);
+void   arena_prefork5(tsdn_t *tsdn, arena_t *arena);
+void   arena_prefork6(tsdn_t *tsdn, arena_t *arena);
+void   arena_prefork7(tsdn_t *tsdn, arena_t *arena);
+void   arena_prefork8(tsdn_t *tsdn, arena_t *arena);
+void   arena_postfork_parent(tsdn_t *tsdn, arena_t *arena);
+void   arena_postfork_child(tsdn_t *tsdn, arena_t *arena);
+
+#endif /* JEMALLOC_INTERNAL_ARENA_EXTERNS_H */
--- a/include/jemalloc/internal/arena_inlines_a.h
+++ b/include/jemalloc/internal/arena_inlines_a.h
@ -0,0 +1,27 @@
+#ifndef JEMALLOC_INTERNAL_ARENA_INLINES_A_H
+#define JEMALLOC_INTERNAL_ARENA_INLINES_A_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/arena_structs.h"
+
+/* Returns the index recorded in the arena's `ind` field. */
+static inline unsigned
+arena_ind_get(const arena_t *arena) {
+	return arena->ind;
+}
+
+/*
+ * Atomically adds `size` to the arena's stats.internal counter, using relaxed
+ * memory ordering.
+ */
+static inline void
+arena_internal_add(arena_t *arena, size_t size) {
+	atomic_fetch_add_zu(&arena->stats.internal, size, ATOMIC_RELAXED);
+}
+
+/*
+ * Atomically subtracts `size` from the arena's stats.internal counter, using
+ * relaxed memory ordering.
+ */
+static inline void
+arena_internal_sub(arena_t *arena, size_t size) {
+	atomic_fetch_sub_zu(&arena->stats.internal, size, ATOMIC_RELAXED);
+}
+
+/*
+ * Returns the current value of the arena's stats.internal counter (relaxed
+ * atomic load).
+ */
+static inline size_t
+arena_internal_get(arena_t *arena) {
+	return atomic_load_zu(&arena->stats.internal, ATOMIC_RELAXED);
+}
+
+#endif /* JEMALLOC_INTERNAL_ARENA_INLINES_A_H */
--- a/include/jemalloc/internal/arena_inlines_b.h
+++ b/include/jemalloc/internal/arena_inlines_b.h
@ -0,0 +1,538 @@
+#ifndef JEMALLOC_INTERNAL_ARENA_INLINES_B_H
+#define JEMALLOC_INTERNAL_ARENA_INLINES_B_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/arena_externs.h"
+#include "jemalloc/internal/arena_structs.h"
+#include "jemalloc/internal/bin_inlines.h"
+#include "jemalloc/internal/div.h"
+#include "jemalloc/internal/emap.h"
+#include "jemalloc/internal/jemalloc_internal_inlines_b.h"
+#include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/large_externs.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/prof_externs.h"
+#include "jemalloc/internal/prof_structs.h"
+#include "jemalloc/internal/rtree.h"
+#include "jemalloc/internal/safety_check.h"
+#include "jemalloc/internal/sc.h"
+#include "jemalloc/internal/sz.h"
+#include "jemalloc/internal/tcache_inlines.h"
+#include "jemalloc/internal/ticker.h"
+
+/*
+ * Returns the arena whose index is recorded in `edata`, by loading the
+ * corresponding slot of the global `arenas` array (relaxed atomic load).
+ */
+static inline arena_t *
+arena_get_from_edata(edata_t *edata) {
+	return (arena_t *)atomic_load_p(
+	    &arenas[edata_arena_ind_get(edata)], ATOMIC_RELAXED);
+}
+
+/*
+ * Selects the arena to allocate `size` bytes from.  An explicitly supplied
+ * `arena` is returned unchanged.  Otherwise the thread's arena is used
+ * (obtained through arena_choose() when tsd_arena_get() returns NULL), except
+ * that requests at or above that arena's oversize_threshold are redirected to
+ * arena_choose_huge() when arena_is_auto() holds for the thread's arena.
+ */
+JEMALLOC_ALWAYS_INLINE arena_t *
+arena_choose_maybe_huge(tsd_t *tsd, arena_t *arena, size_t size) {
+	if (arena != NULL) {
+		return arena;
+	}
+
+	/*
+	 * For huge allocations, use the dedicated huge arena if both are true:
+	 * 1) is using auto arena selection (i.e. arena == NULL), and 2) the
+	 * thread is not assigned to a manual arena.
+	 */
+	arena_t *tsd_arena = tsd_arena_get(tsd);
+	if (tsd_arena == NULL) {
+		tsd_arena = arena_choose(tsd, NULL);
+	}
+
+	/* The threshold is read from the thread's arena, not a global. */
+	size_t threshold = atomic_load_zu(
+	    &tsd_arena->pa_shard.pac.oversize_threshold, ATOMIC_RELAXED);
+	if (unlikely(size >= threshold) && arena_is_auto(tsd_arena)) {
+		return arena_choose_huge(tsd);
+	}
+
+	return tsd_arena;
+}
+
+/*
+ * Validates a large deallocation of `ptr` described by `edata`.
+ *
+ * Returns true if a problem was detected and reported, in which case the
+ * callers in this file return without deallocating; returns false otherwise.
+ * Always returns false when config_opt_safety_checks is disabled.
+ *
+ * Two conditions are reported:
+ * - `edata` is NULL or not in the active state (reported as a possible double
+ *   free), and
+ * - `input_size` differs from the extent's usize or exceeds
+ *   SC_LARGE_MAXCLASS (reported as a sized-deallocation mismatch).
+ */
+JEMALLOC_ALWAYS_INLINE bool
+large_dalloc_safety_checks(edata_t *edata, const void *ptr, size_t input_size) {
+	if (!config_opt_safety_checks) {
+		return false;
+	}
+
+	/*
+	 * Eagerly detect double free and sized dealloc bugs for large sizes.
+	 * The cost is low enough (as edata will be accessed anyway) to be
+	 * enabled all the time.
+	 */
+	if (unlikely(edata == NULL
+	        || edata_state_get(edata) != extent_state_active)) {
+		safety_check_fail(
+		    "Invalid deallocation detected: "
+		    "pages being freed (%p) not currently active, "
+		    "possibly caused by double free bugs.",
+		    ptr);
+		return true;
+	}
+	if (unlikely(input_size != edata_usize_get(edata)
+	        || input_size > SC_LARGE_MAXCLASS)) {
+		safety_check_fail_sized_dealloc(/* current_dealloc */ true, ptr,
+		    /* true_size */ edata_usize_get(edata), input_size);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Fills `prof_info` for the allocation at `ptr`.
+ *
+ * `alloc_ctx` may be NULL, in which case the slab flag and usize are read from
+ * the edata found in the global emap.  For slab allocations only
+ * prof_info->alloc_tctx is written (set to PROF_TCTX_SENTINEL).  For non-slab
+ * allocations the work is delegated to large_prof_info_get(); when
+ * `reset_recent` is true, large_dalloc_safety_checks() runs first and, on
+ * failure, alloc_tctx is set to PROF_TCTX_SENTINEL and the function returns.
+ */
+JEMALLOC_ALWAYS_INLINE void
+arena_prof_info_get(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx,
+    prof_info_t *prof_info, bool reset_recent) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+	assert(prof_info != NULL);
+
+	edata_t *edata = NULL;
+	bool     is_slab;
+
+	/* Static check. */
+	if (alloc_ctx == NULL) {
+		edata = emap_edata_lookup(
+		    tsd_tsdn(tsd), &arena_emap_global, ptr);
+		is_slab = edata_slab_get(edata);
+	} else if (unlikely(!(is_slab = alloc_ctx->slab))) {
+		/* The edata is only looked up when it will actually be used. */
+		edata = emap_edata_lookup(
+		    tsd_tsdn(tsd), &arena_emap_global, ptr);
+	}
+
+	if (unlikely(!is_slab)) {
+		/* edata must have been initialized at this point. */
+		assert(edata != NULL);
+		size_t usize = (alloc_ctx == NULL)
+		    ? edata_usize_get(edata)
+		    : emap_alloc_ctx_usize_get(alloc_ctx);
+		if (reset_recent
+		    && large_dalloc_safety_checks(edata, ptr, usize)) {
+			prof_info->alloc_tctx = PROF_TCTX_SENTINEL;
+			return;
+		}
+		large_prof_info_get(tsd, edata, prof_info, reset_recent);
+	} else {
+		prof_info->alloc_tctx = PROF_TCTX_SENTINEL;
+		/*
+		 * No need to set other fields in prof_info; they will never be
+		 * accessed if alloc_tctx == PROF_TCTX_SENTINEL.
+		 */
+	}
+}
+
+/*
+ * Resets the profiling tctx of the allocation at `ptr` via
+ * large_prof_tctx_reset() when the allocation is not slab-backed; does nothing
+ * for slab allocations.  `alloc_ctx` may be NULL, in which case the slab flag
+ * is read from the edata looked up in the global emap.
+ */
+JEMALLOC_ALWAYS_INLINE void
+arena_prof_tctx_reset(
+    tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+
+	/* Static check. */
+	if (alloc_ctx == NULL) {
+		edata_t *edata = emap_edata_lookup(
+		    tsd_tsdn(tsd), &arena_emap_global, ptr);
+		if (unlikely(!edata_slab_get(edata))) {
+			large_prof_tctx_reset(edata);
+		}
+	} else {
+		if (unlikely(!alloc_ctx->slab)) {
+			edata_t *edata = emap_edata_lookup(
+			    tsd_tsdn(tsd), &arena_emap_global, ptr);
+			large_prof_tctx_reset(edata);
+		}
+	}
+}
+
+/*
+ * Unconditionally resets the profiling tctx of the allocation at `ptr` via
+ * large_prof_tctx_reset().  The allocation must not be slab-backed (asserted).
+ */
+JEMALLOC_ALWAYS_INLINE void
+arena_prof_tctx_reset_sampled(tsd_t *tsd, const void *ptr) {
+	cassert(config_prof);
+	assert(ptr != NULL);
+
+	edata_t *edata = emap_edata_lookup(
+	    tsd_tsdn(tsd), &arena_emap_global, ptr);
+	assert(!edata_slab_get(edata));
+
+	large_prof_tctx_reset(edata);
+}
+
+/*
+ * Records `tctx` and `size` for a non-slab extent by forwarding to
+ * large_prof_info_set().  `edata` must not be slab-backed (asserted).  The
+ * `tsd` argument is not used in this body.
+ */
+JEMALLOC_ALWAYS_INLINE void
+arena_prof_info_set(
+    tsd_t *tsd, edata_t *edata, prof_tctx_t *tctx, size_t size) {
+	cassert(config_prof);
+
+	assert(!edata_slab_get(edata));
+	large_prof_info_set(edata, tctx, size);
+}
+
+/*
+ * Feeds `nticks` ticks into the calling thread's decay ticker and calls
+ * arena_decay(tsdn, arena, false, false) when ticker_geom_ticks() returns
+ * true.  Does nothing when `tsdn` is null.
+ */
+JEMALLOC_ALWAYS_INLINE void
+arena_decay_ticks(tsdn_t *tsdn, arena_t *arena, unsigned nticks) {
+	if (unlikely(tsdn_null(tsdn))) {
+		return;
+	}
+	tsd_t *tsd = tsdn_tsd(tsdn);
+	/*
+	 * We use the ticker_geom_t to avoid having per-arena state in the tsd.
+	 * Instead of having a countdown-until-decay timer running for every
+	 * arena in every thread, we flip a coin once per tick, whose
+	 * probability of coming up heads is 1/nticks; this is effectively the
+	 * operation of the ticker_geom_t.  Each arena has the same chance of a
+	 * coinflip coming up heads (1/ARENA_DECAY_NTICKS_PER_UPDATE), so we can
+	 * use a single ticker for all of them.
+	 */
+	ticker_geom_t *decay_ticker = tsd_arena_decay_tickerp_get(tsd);
+	uint64_t      *prng_state = tsd_prng_statep_get(tsd);
+	/* The last argument tells the ticker whether the thread is reentrant. */
+	if (unlikely(ticker_geom_ticks(decay_ticker, prng_state, nticks,
+	        tsd_reentrancy_level_get(tsd) > 0))) {
+		arena_decay(tsdn, arena, false, false);
+	}
+}
+
+/* Convenience wrapper: arena_decay_ticks() with a single tick. */
+JEMALLOC_ALWAYS_INLINE void
+arena_decay_tick(tsdn_t *tsdn, arena_t *arena) {
+	arena_decay_ticks(tsdn, arena, 1);
+}
+
+/*
+ * Allocates `size` bytes of size class `ind`.
+ *
+ * When a tcache is supplied, slab requests go to tcache_alloc_small(), and
+ * non-slab requests go to tcache_alloc_large() provided `ind` is below the
+ * tcache's bin count and the corresponding bin is not disabled.  Every other
+ * case (no tcache, or a non-slab request the tcache cannot serve) falls back
+ * to arena_malloc_hard().
+ *
+ * A non-NULL `tcache` requires a non-null `tsdn` (asserted).
+ */
+JEMALLOC_ALWAYS_INLINE void *
+arena_malloc(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, bool zero,
+    bool slab, tcache_t *tcache, bool slow_path) {
+	assert(!tsdn_null(tsdn) || tcache == NULL);
+
+	if (likely(tcache != NULL)) {
+		if (likely(slab)) {
+			assert(sz_can_use_slab(size));
+			return tcache_alloc_small(tsdn_tsd(tsdn), arena, tcache,
+			    size, ind, zero, slow_path);
+		} else if (likely(ind < tcache_nbins_get(tcache->tcache_slow)
+		               && !tcache_bin_disabled(ind, &tcache->bins[ind],
+		                   tcache->tcache_slow))) {
+			return tcache_alloc_large(tsdn_tsd(tsdn), arena, tcache,
+			    size, ind, zero, slow_path);
+		}
+		/* (size > tcache_max) case falls through. */
+	}
+
+	return arena_malloc_hard(tsdn, arena, size, ind, zero, slab);
+}
+
+/*
+ * Returns the arena associated with the allocation at `ptr`: looks up the
+ * edata in the global emap, reads its arena index, and loads that slot of the
+ * global `arenas` array (relaxed atomic load).
+ */
+JEMALLOC_ALWAYS_INLINE arena_t *
+arena_aalloc(tsdn_t *tsdn, const void *ptr) {
+	edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr);
+	unsigned arena_ind = edata_arena_ind_get(edata);
+	return (arena_t *)atomic_load_p(&arenas[arena_ind], ATOMIC_RELAXED);
+}
+
+/*
+ * Returns the usize of the allocation at `ptr`, taken from the alloc ctx
+ * looked up in the global emap.  `ptr` must be non-NULL, and the looked-up
+ * size class index must not be SC_NSIZES (both asserted).
+ */
+JEMALLOC_ALWAYS_INLINE size_t
+arena_salloc(tsdn_t *tsdn, const void *ptr) {
+	assert(ptr != NULL);
+	emap_alloc_ctx_t alloc_ctx;
+	emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx);
+	assert(alloc_ctx.szind != SC_NSIZES);
+
+	return emap_alloc_ctx_usize_get(&alloc_ctx);
+}
+
+/*
+ * Validating size query: returns the usize of the extent containing `ptr`, or
+ * 0 when the emap lookup reports the pointer as missing or yields a NULL
+ * edata.
+ */
+JEMALLOC_ALWAYS_INLINE size_t
+arena_vsalloc(tsdn_t *tsdn, const void *ptr) {
+	/*
+	 * Return 0 if ptr is not within an extent managed by jemalloc.  This
+	 * function has two extra costs relative to isalloc():
+	 * - The rtree calls cannot claim to be dependent lookups, which induces
+	 *   rtree lookup load dependencies.
+	 * - The lookup may fail, so there is an extra branch to check for
+	 *   failure.
+	 */
+
+	emap_full_alloc_ctx_t full_alloc_ctx;
+	bool                  missing = emap_full_alloc_ctx_try_lookup(
+            tsdn, &arena_emap_global, ptr, &full_alloc_ctx);
+	if (missing) {
+		return 0;
+	}
+
+	if (full_alloc_ctx.edata == NULL) {
+		return 0;
+	}
+	assert(edata_state_get(full_alloc_ctx.edata) == extent_state_active);
+	/* Only slab members should be looked up via interior pointers. */
+	assert(edata_addr_get(full_alloc_ctx.edata) == ptr
+	    || edata_slab_get(full_alloc_ctx.edata));
+
+	assert(full_alloc_ctx.szind != SC_NSIZES);
+
+	return edata_usize_get(full_alloc_ctx.edata);
+}
+
+/*
+ * Frees a non-slab allocation without going through a tcache.
+ *
+ * With profiling compiled in, a `szind` below SC_NBINS is handed to
+ * arena_dalloc_promoted().  Otherwise the edata is looked up, checked with
+ * large_dalloc_safety_checks() against `usize`, and released through
+ * large_dalloc(); if the safety check reports a problem the function returns
+ * without freeing.
+ */
+static inline void
+arena_dalloc_large_no_tcache(
+    tsdn_t *tsdn, void *ptr, szind_t szind, size_t usize) {
+	/*
+	 * szind is still needed in this function mainly because
+	 * szind < SC_NBINS determines not only if this is a small alloc,
+	 * but also if szind is valid (an inactive extent would have
+	 * szind == SC_NSIZES).
+	 */
+	if (config_prof && unlikely(szind < SC_NBINS)) {
+		arena_dalloc_promoted(tsdn, ptr, NULL, true);
+	} else {
+		edata_t *edata = emap_edata_lookup(
+		    tsdn, &arena_emap_global, ptr);
+		if (large_dalloc_safety_checks(edata, ptr, usize)) {
+			/* See the comment in isfree. */
+			return;
+		}
+		large_dalloc(tsdn, edata);
+	}
+}
+
+/*
+ * Frees `ptr` without a tcache.  The alloc ctx is looked up in the global
+ * emap; slab allocations go to arena_dalloc_small() and all others to
+ * arena_dalloc_large_no_tcache().  In debug builds the looked-up ctx is
+ * cross-checked against the edata.
+ */
+static inline void
+arena_dalloc_no_tcache(tsdn_t *tsdn, void *ptr) {
+	assert(ptr != NULL);
+
+	emap_alloc_ctx_t alloc_ctx;
+	emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx);
+
+	if (config_debug) {
+		edata_t *edata = emap_edata_lookup(
+		    tsdn, &arena_emap_global, ptr);
+		assert(alloc_ctx.szind == edata_szind_get(edata));
+		assert(alloc_ctx.szind < SC_NSIZES);
+		assert(alloc_ctx.slab == edata_slab_get(edata));
+		assert(emap_alloc_ctx_usize_get(&alloc_ctx)
+		    == edata_usize_get(edata));
+	}
+
+	if (likely(alloc_ctx.slab)) {
+		/* Small allocation. */
+		arena_dalloc_small(tsdn, ptr);
+	} else {
+		arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind,
+		    emap_alloc_ctx_usize_get(&alloc_ctx));
+	}
+}
+
+/*
+ * Frees a non-slab allocation when a tcache is available (`tsdn` non-null and
+ * `tcache` non-NULL, asserted).
+ *
+ * - With profiling compiled in and `szind` below SC_NBINS, the pointer is
+ *   treated as a promoted sample and handed to arena_dalloc_promoted().
+ * - Otherwise, if `szind` is below the tcache's bin count and the bin is not
+ *   disabled, the pointer is cached via tcache_dalloc_large().
+ * - Otherwise the edata is validated with large_dalloc_safety_checks() and
+ *   released through large_dalloc(); a failed check returns without freeing.
+ */
+JEMALLOC_ALWAYS_INLINE void
+arena_dalloc_large(tsdn_t *tsdn, void *ptr, tcache_t *tcache, szind_t szind,
+    size_t usize, bool slow_path) {
+	assert(!tsdn_null(tsdn) && tcache != NULL);
+	bool is_sample_promoted = config_prof && szind < SC_NBINS;
+	if (unlikely(is_sample_promoted)) {
+		arena_dalloc_promoted(tsdn, ptr, tcache, slow_path);
+	} else {
+		if (szind < tcache_nbins_get(tcache->tcache_slow)
+		    && !tcache_bin_disabled(
+		        szind, &tcache->bins[szind], tcache->tcache_slow)) {
+			tcache_dalloc_large(
+			    tsdn_tsd(tsdn), tcache, ptr, szind, slow_path);
+		} else {
+			edata_t *edata = emap_edata_lookup(
+			    tsdn, &arena_emap_global, ptr);
+			if (large_dalloc_safety_checks(edata, ptr, usize)) {
+				/* See the comment in isfree. */
+				return;
+			}
+			large_dalloc(tsdn, edata);
+		}
+	}
+}
+
+/*
+ * Debug-only check run before a small pointer is handed to the tcache.
+ *
+ * Computes the region index of `ptr` within its slab and tests that region's
+ * bit in the slab bitmap; when bitmap_get() returns false the deallocation is
+ * reported via safety_check_fail() and true is returned.  Returns false when
+ * the check passes or when config_debug is off.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+arena_tcache_dalloc_small_safety_check(tsdn_t *tsdn, void *ptr) {
+	if (!config_debug) {
+		return false;
+	}
+	edata_t   *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr);
+	szind_t    binind = edata_szind_get(edata);
+	div_info_t div_info = arena_binind_div_info[binind];
+	/*
+	 * Calls the internal function bin_slab_regind_impl because the
+	 * safety check does not require a lock.
+	 */
+	size_t regind = bin_slab_regind_impl(&div_info, binind, edata, ptr);
+	slab_data_t      *slab_data = edata_slab_data_get(edata);
+	const bin_info_t *bin_info = &bin_infos[binind];
+	assert(edata_nfree_get(edata) < bin_info->nregs);
+	if (unlikely(!bitmap_get(
+	        slab_data->bitmap, &bin_info->bitmap_info, regind))) {
+		safety_check_fail(
+		    "Invalid deallocation detected: the pointer being freed (%p) not "
+		    "currently active, possibly caused by double free bugs.\n",
+		    ptr);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Frees `ptr`.
+ *
+ * Without a tcache the call is forwarded to arena_dalloc_no_tcache().
+ * Otherwise the alloc ctx is taken from `caller_alloc_ctx` when provided, or
+ * looked up in the global emap.  Slab allocations are passed through
+ * arena_tcache_dalloc_small_safety_check() and then tcache_dalloc_small();
+ * non-slab allocations go to arena_dalloc_large().
+ *
+ * A non-NULL `tcache` requires a non-null `tsdn` (asserted).
+ */
+JEMALLOC_ALWAYS_INLINE void
+arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
+    emap_alloc_ctx_t *caller_alloc_ctx, bool slow_path) {
+	assert(!tsdn_null(tsdn) || tcache == NULL);
+	assert(ptr != NULL);
+
+	if (unlikely(tcache == NULL)) {
+		arena_dalloc_no_tcache(tsdn, ptr);
+		return;
+	}
+
+	emap_alloc_ctx_t alloc_ctx;
+	if (caller_alloc_ctx != NULL) {
+		alloc_ctx = *caller_alloc_ctx;
+	} else {
+		util_assume(tsdn != NULL);
+		emap_alloc_ctx_lookup(
+		    tsdn, &arena_emap_global, ptr, &alloc_ctx);
+	}
+
+	/* Debug builds cross-check the ctx against the edata. */
+	if (config_debug) {
+		edata_t *edata = emap_edata_lookup(
+		    tsdn, &arena_emap_global, ptr);
+		assert(alloc_ctx.szind == edata_szind_get(edata));
+		assert(alloc_ctx.szind < SC_NSIZES);
+		assert(alloc_ctx.slab == edata_slab_get(edata));
+		assert(emap_alloc_ctx_usize_get(&alloc_ctx)
+		    == edata_usize_get(edata));
+	}
+
+	if (likely(alloc_ctx.slab)) {
+		/* Small allocation. */
+		if (arena_tcache_dalloc_small_safety_check(tsdn, ptr)) {
+			return;
+		}
+		tcache_dalloc_small(
+		    tsdn_tsd(tsdn), tcache, ptr, alloc_ctx.szind, slow_path);
+	} else {
+		arena_dalloc_large(tsdn, ptr, tcache, alloc_ctx.szind,
+		    emap_alloc_ctx_usize_get(&alloc_ctx), slow_path);
+	}
+}
+
+/*
+ * Sized free of `ptr` without a tcache.  `size` must not exceed
+ * SC_LARGE_MAXCLASS (asserted).
+ *
+ * When profiling is not active, the alloc ctx is derived purely from `size`.
+ * When profiling is active, or in debug builds, the ctx is instead looked up
+ * in the global emap (overwriting any size-derived ctx) and checked against
+ * `size`.  Slab allocations then go to arena_dalloc_small(), all others to
+ * arena_dalloc_large_no_tcache().
+ */
+static inline void
+arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) {
+	assert(ptr != NULL);
+	assert(size <= SC_LARGE_MAXCLASS);
+
+	emap_alloc_ctx_t alloc_ctx;
+	if (!config_prof || !opt_prof) {
+		/*
+		 * There is no risk of being confused by a promoted sampled
+		 * object, so base szind and slab on the given size.
+		 */
+		szind_t szind = sz_size2index(size);
+		emap_alloc_ctx_init(
+		    &alloc_ctx, szind, (szind < SC_NBINS), size);
+	}
+
+	if ((config_prof && opt_prof) || config_debug) {
+		emap_alloc_ctx_lookup(
+		    tsdn, &arena_emap_global, ptr, &alloc_ctx);
+
+		assert(alloc_ctx.szind == sz_size2index(size));
+		assert((config_prof && opt_prof)
+		    || alloc_ctx.slab == (alloc_ctx.szind < SC_NBINS));
+
+		if (config_debug) {
+			edata_t *edata = emap_edata_lookup(
+			    tsdn, &arena_emap_global, ptr);
+			assert(alloc_ctx.szind == edata_szind_get(edata));
+			assert(alloc_ctx.slab == edata_slab_get(edata));
+		}
+	}
+
+	if (likely(alloc_ctx.slab)) {
+		/* Small allocation. */
+		arena_dalloc_small(tsdn, ptr);
+	} else {
+		arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind,
+		    emap_alloc_ctx_usize_get(&alloc_ctx));
+	}
+}
+
+/*
+ * Sized free of `ptr`.  `size` must not exceed SC_LARGE_MAXCLASS (asserted).
+ *
+ * Without a tcache the call is forwarded to arena_sdalloc_no_tcache().
+ * Otherwise:
+ * - when profiling is active, the alloc ctx comes from `caller_alloc_ctx`, or
+ *   from an emap lookup if the caller passed NULL;
+ * - when profiling is not active, only szind and slab are derived from
+ *   `size`.
+ * Slab allocations are passed through
+ * arena_tcache_dalloc_small_safety_check() and then tcache_dalloc_small();
+ * non-slab allocations go to arena_dalloc_large() with sz_s2u(size) as the
+ * usize.
+ */
+JEMALLOC_ALWAYS_INLINE void
+arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
+    emap_alloc_ctx_t *caller_alloc_ctx, bool slow_path) {
+	assert(!tsdn_null(tsdn) || tcache == NULL);
+	assert(ptr != NULL);
+	assert(size <= SC_LARGE_MAXCLASS);
+
+	if (unlikely(tcache == NULL)) {
+		arena_sdalloc_no_tcache(tsdn, ptr, size);
+		return;
+	}
+
+	emap_alloc_ctx_t alloc_ctx;
+	if (config_prof && opt_prof) {
+		if (caller_alloc_ctx == NULL) {
+			/* Uncommon case and should be a static check. */
+			emap_alloc_ctx_lookup(
+			    tsdn, &arena_emap_global, ptr, &alloc_ctx);
+			assert(alloc_ctx.szind == sz_size2index(size));
+			assert(emap_alloc_ctx_usize_get(&alloc_ctx) == size);
+		} else {
+			alloc_ctx = *caller_alloc_ctx;
+		}
+	} else {
+		/*
+		 * There is no risk of being confused by a promoted sampled
+		 * object, so base szind and slab on the given size.
+		 */
+		alloc_ctx.szind = sz_size2index(size);
+		alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS);
+	}
+
+	if (config_debug) {
+		edata_t *edata = emap_edata_lookup(
+		    tsdn, &arena_emap_global, ptr);
+		assert(alloc_ctx.szind == edata_szind_get(edata));
+		assert(alloc_ctx.slab == edata_slab_get(edata));
+		/*
+		 * Re-initialize the ctx with sz_s2u(size) so that its usize
+		 * can be compared against the edata below.
+		 */
+		emap_alloc_ctx_init(
+		    &alloc_ctx, alloc_ctx.szind, alloc_ctx.slab, sz_s2u(size));
+		assert(emap_alloc_ctx_usize_get(&alloc_ctx)
+		    == edata_usize_get(edata));
+	}
+
+	if (likely(alloc_ctx.slab)) {
+		/* Small allocation. */
+		if (arena_tcache_dalloc_small_safety_check(tsdn, ptr)) {
+			return;
+		}
+		tcache_dalloc_small(
+		    tsdn_tsd(tsdn), tcache, ptr, alloc_ctx.szind, slow_path);
+	} else {
+		arena_dalloc_large(tsdn, ptr, tcache, alloc_ctx.szind,
+		    sz_s2u(size), slow_path);
+	}
+}
+
+/*
+ * Adds a random offset to edata->e_addr when `alignment` is smaller than
+ * PAGE; does nothing otherwise.
+ *
+ * The offset is r << lg_floor(CACHELINE_CEILING(alignment)), where r is drawn
+ * with prng_lg_range_u64() using
+ * lg_range = LG_PAGE - lg_floor(CACHELINE_CEILING(alignment)).  On entry the
+ * extent's address must equal its base (asserted); on exit the new address is
+ * asserted to be aligned to `alignment`.  The `arena` argument is not used in
+ * this body.
+ */
+static inline void
+arena_cache_oblivious_randomize(
+    tsdn_t *tsdn, arena_t *arena, edata_t *edata, size_t alignment) {
+	assert(edata_base_get(edata) == edata_addr_get(edata));
+
+	if (alignment < PAGE) {
+		unsigned lg_range = LG_PAGE
+		    - lg_floor(CACHELINE_CEILING(alignment));
+		size_t r;
+		if (!tsdn_null(tsdn)) {
+			/* Use the per-thread PRNG state when tsd is available. */
+			tsd_t *tsd = tsdn_tsd(tsdn);
+			r = (size_t)prng_lg_range_u64(
+			    tsd_prng_statep_get(tsd), lg_range);
+		} else {
+			/* No tsd: seed the PRNG from a stack address instead. */
+			uint64_t stack_value = (uint64_t)(uintptr_t)&r;
+			r = (size_t)prng_lg_range_u64(&stack_value, lg_range);
+		}
+		uintptr_t random_offset = ((uintptr_t)r)
+		    << (LG_PAGE - lg_range);
+		edata->e_addr = (void *)((byte_t *)edata->e_addr
+		    + random_offset);
+		assert(ALIGNMENT_ADDR2BASE(edata->e_addr, alignment)
+		    == edata->e_addr);
+	}
+}
+
+/*
+ * Returns the bin for size class `binind` and shard `binshard`.  Shard 0 of
+ * the class sits arena_bin_offsets[binind] bytes past the start of the arena;
+ * the requested shard is found by indexing `binshard` elements from there.
+ */
+static inline bin_t *
+arena_get_bin(arena_t *arena, szind_t binind, unsigned binshard) {
+	bin_t *shard0 = (bin_t *)((byte_t *)arena + arena_bin_offsets[binind]);
+	return shard0 + binshard;
+}
+
+#endif /* JEMALLOC_INTERNAL_ARENA_INLINES_B_H */
--- a/include/jemalloc/internal/arena_stats.h
+++ b/include/jemalloc/internal/arena_stats.h
@ -0,0 +1,123 @@
+#ifndef JEMALLOC_INTERNAL_ARENA_STATS_H
+#define JEMALLOC_INTERNAL_ARENA_STATS_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/lockedint.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/mutex_prof.h"
+#include "jemalloc/internal/pa.h"
+#include "jemalloc/internal/sc.h"
+
+JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+
+/* Statistics tracked for a single large size class. */
+typedef struct arena_stats_large_s arena_stats_large_t;
+struct arena_stats_large_s {
+	/*
+	 * Total number of large allocation/deallocation requests served directly
+	 * by the arena.
+	 */
+	locked_u64_t nmalloc;
+	locked_u64_t ndalloc;
+
+	/*
+	 * Total large active bytes (allocated - deallocated) served directly
+	 * by the arena.
+	 */
+	locked_u64_t active_bytes;
+
+	/*
+	 * Number of allocation requests that correspond to this size class.
+	 * This includes requests served by tcache, though tcache only
+	 * periodically merges into this counter.
+	 */
+	locked_u64_t nrequests; /* Partially derived. */
+	/*
+	 * Number of tcache fills / flushes for large (similarly, periodically
+	 * merged).  Note that there is no large tcache batch-fill currently
+	 * (i.e. only fill 1 at a time); however flush may be batched.
+	 */
+	locked_u64_t nfills;   /* Partially derived. */
+	locked_u64_t nflushes; /* Partially derived. */
+
+	/* Current number of allocations of this size class. */
+	size_t curlextents; /* Derived. */
+};
+
+/*
+ * Arena stats.  Note that fields marked "derived" are not directly maintained
+ * within the arena code; rather their values are derived during stats merge
+ * requests.
+ */
+typedef struct arena_stats_s arena_stats_t;
+struct arena_stats_s {
+	/* Mutex passed to the LOCKEDINT_MTX_* macros for this struct. */
+	LOCKEDINT_MTX_DECLARE(mtx)
+
+	/*
+	 * resident includes the base stats -- that's why it lives here and not
+	 * in pa_shard_stats_t.
+	 */
+	size_t base;           /* Derived. */
+	size_t metadata_edata; /* Derived. */
+	size_t metadata_rtree; /* Derived. */
+	size_t resident;       /* Derived. */
+	size_t metadata_thp;   /* Derived. */
+	size_t mapped;         /* Derived. */
+
+	/*
+	 * Updated with arena_internal_add() / arena_internal_sub() and read with
+	 * arena_internal_get(), all using relaxed atomics.
+	 */
+	atomic_zu_t internal;
+
+	size_t   allocated_large; /* Derived. */
+	uint64_t nmalloc_large;   /* Derived. */
+	uint64_t ndalloc_large;   /* Derived. */
+	uint64_t nfills_large;    /* Derived. */
+	uint64_t nflushes_large;  /* Derived. */
+	uint64_t nrequests_large; /* Derived. */
+
+	/*
+	 * The stats logically owned by the pa_shard in the same arena.  This
+	 * lives here only because it's convenient for the purposes of the ctl
+	 * module -- it only knows about the single arena_stats.
+	 */
+	pa_shard_stats_t pa_shard_stats;
+
+	/* Number of bytes cached in tcache associated with this arena. */
+	size_t tcache_bytes;         /* Derived. */
+	size_t tcache_stashed_bytes; /* Derived. */
+
+	mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes];
+
+	/* One element for each large size class. */
+	arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
+
+	/* Arena uptime. */
+	nstime_t uptime;
+};
+
+/*
+ * Initializes the stats mutex of `arena_stats`.  Returns true if
+ * LOCKEDINT_MTX_INIT() reports failure, false on success.
+ *
+ * The structure is expected to be zero-filled on entry; debug builds assert
+ * this byte by byte.  The `tsdn` argument is not used in this body.
+ */
+static inline bool
+arena_stats_init(tsdn_t *tsdn, arena_stats_t *arena_stats) {
+	if (config_debug) {
+		for (size_t i = 0; i < sizeof(arena_stats_t); i++) {
+			assert(((char *)arena_stats)[i] == 0);
+		}
+	}
+	if (LOCKEDINT_MTX_INIT(arena_stats->mtx, "arena_stats",
+	        WITNESS_RANK_ARENA_STATS, malloc_mutex_rank_exclusive)) {
+		return true;
+	}
+	/* Memory is zeroed, so there is no need to clear stats. */
+	return false;
+}
+
+/*
+ * Records one flush for large size class `szind`: while holding the stats
+ * mutex, adds `nrequests` to that class's nrequests counter and increments its
+ * nflushes counter by one.  The lstats array is indexed by szind - SC_NBINS.
+ */
+static inline void
+arena_stats_large_flush_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats,
+    szind_t szind, uint64_t nrequests) {
+	LOCKEDINT_MTX_LOCK(tsdn, arena_stats->mtx);
+	arena_stats_large_t *lstats = &arena_stats->lstats[szind - SC_NBINS];
+	locked_inc_u64(tsdn, LOCKEDINT_MTX(arena_stats->mtx),
+	    &lstats->nrequests, nrequests);
+	locked_inc_u64(
+	    tsdn, LOCKEDINT_MTX(arena_stats->mtx), &lstats->nflushes, 1);
+	LOCKEDINT_MTX_UNLOCK(tsdn, arena_stats->mtx);
+}
+
+#endif /* JEMALLOC_INTERNAL_ARENA_STATS_H */
--- a/include/jemalloc/internal/arena_structs.h
+++ b/include/jemalloc/internal/arena_structs.h
@ -0,0 +1,111 @@
+#ifndef JEMALLOC_INTERNAL_ARENA_STRUCTS_H
+#define JEMALLOC_INTERNAL_ARENA_STRUCTS_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/arena_stats.h"
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/bin.h"
+#include "jemalloc/internal/bitmap.h"
+#include "jemalloc/internal/counter.h"
+#include "jemalloc/internal/ecache.h"
+#include "jemalloc/internal/edata_cache.h"
+#include "jemalloc/internal/extent_dss.h"
+#include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/nstime.h"
+#include "jemalloc/internal/pa.h"
+#include "jemalloc/internal/ql.h"
+#include "jemalloc/internal/sc.h"
+#include "jemalloc/internal/ticker.h"
+
+/*
+ * Per-arena state: thread counts, statistics, tcache lists, the list of large
+ * extents, the page allocator shard, the base allocator, and a trailing
+ * array of bins.
+ */
+struct arena_s {
+	/*
+	 * Number of threads currently assigned to this arena.  Each thread has
+	 * two distinct assignments, one for application-serving allocation, and
+	 * the other for internal metadata allocation.  Internal metadata must
+	 * not be allocated from arenas explicitly created via the arenas.create
+	 * mallctl, because the arena.<i>.reset mallctl indiscriminately
+	 * discards all allocations for the affected arena.
+	 *
+	 *   0: Application allocation.
+	 *   1: Internal metadata allocation.
+	 *
+	 * Synchronization: atomic.
+	 */
+	atomic_u_t nthreads[2];
+
+	/* Next bin shard for binding new threads. Synchronization: atomic. */
+	atomic_u_t binshard_next;
+
+	/*
+	 * When percpu_arena is enabled, to amortize the cost of reading /
+	 * updating the current CPU id, track the most recent thread accessing
+	 * this arena, and only read CPU if there is a mismatch.
+	 */
+	tsdn_t *last_thd;
+
+	/* Synchronization: internal. */
+	arena_stats_t stats;
+
+	/*
+	 * Lists of tcaches and cache_bin_array_descriptors for extant threads
+	 * associated with this arena.  Stats from these are merged
+	 * incrementally, and at exit if opt_stats_print is enabled.
+	 *
+	 * Synchronization: tcache_ql_mtx.
+	 */
+	ql_head(tcache_slow_t) tcache_ql;
+	ql_head(cache_bin_array_descriptor_t) cache_bin_array_descriptor_ql;
+	malloc_mutex_t tcache_ql_mtx;
+
+	/*
+	 * Represents a dss_prec_t, but atomically.
+	 *
+	 * Synchronization: atomic.
+	 */
+	atomic_u_t dss_prec;
+
+	/*
+	 * Extant large allocations.
+	 *
+	 * Synchronization: large_mtx.
+	 */
+	edata_list_active_t large;
+	/* Synchronizes all large allocation/update/deallocation. */
+	malloc_mutex_t large_mtx;
+
+	/* The page-level allocator shard this arena uses. */
+	pa_shard_t pa_shard;
+
+	/*
+	 * A cached copy of base->ind.  This can get accessed on hot paths;
+	 * looking it up in base requires an extra pointer hop / cache miss.
+	 */
+	unsigned ind;
+
+	/*
+	 * Base allocator, from which arena metadata are allocated.
+	 *
+	 * Synchronization: internal.
+	 */
+	base_t *base;
+	/* Used to determine uptime.  Read-only after initialization. */
+	nstime_t create_time;
+
+	/* The name of the arena. */
+	char name[ARENA_NAME_LEN];
+
+	/*
+	 * The arena is allocated alongside its bins; really this is a
+	 * dynamically sized array determined by the binshard settings.
+	 * Enforcing cacheline-alignment to minimize the number of cachelines
+	 * touched on the hot paths.
+	 */
+	JEMALLOC_WARN_ON_USAGE(
+	    "Do not use this field directly. "
+	    "Use `arena_get_bin` instead.")
+	JEMALLOC_ALIGNED(CACHELINE)
+	bin_t all_bins[0];
+};
+
+#endif /* JEMALLOC_INTERNAL_ARENA_STRUCTS_H */
--- a/include/jemalloc/internal/arena_types.h
+++ b/include/jemalloc/internal/arena_types.h
@ -0,0 +1,60 @@
+#ifndef JEMALLOC_INTERNAL_ARENA_TYPES_H
+#define JEMALLOC_INTERNAL_ARENA_TYPES_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/sc.h"
+
+/* Default decay times in milliseconds. */
+#define DIRTY_DECAY_MS_DEFAULT ZD(10 * 1000)
+#define MUZZY_DECAY_MS_DEFAULT (0)
+/* Number of event ticks between time checks. */
+#define ARENA_DECAY_NTICKS_PER_UPDATE 1000
+/* Maximum length of the arena name. */
+#define ARENA_NAME_LEN 32
+
+typedef struct arena_s arena_t;
+
+/*
+ * Per-CPU arena modes.  Several enumerators deliberately share a value: the
+ * *_base / *_limit names mark range boundaries, and the remaining names are
+ * the modes themselves.
+ */
+typedef enum {
+	percpu_arena_mode_names_base = 0, /* Used for options processing. */
+
+	/*
+	 * *_uninit are used only during bootstrapping, and must correspond
+	 * to initialized variant plus percpu_arena_mode_enabled_base.
+	 */
+	percpu_arena_uninit = 0,
+	per_phycpu_arena_uninit = 1,
+
+	/* All non-disabled modes must come after percpu_arena_disabled. */
+	percpu_arena_disabled = 2,
+
+	percpu_arena_mode_names_limit = 3, /* Used for options processing. */
+	percpu_arena_mode_enabled_base = 3,
+
+	percpu_arena = 3,
+	per_phycpu_arena = 4 /* Hyper threads share arena. */
+} percpu_arena_mode_t;
+
+/* True when mode `m` is at or above percpu_arena_mode_enabled_base. */
+#define PERCPU_ARENA_ENABLED(m) ((m) >= percpu_arena_mode_enabled_base)
+/* The default mode is percpu_arena_disabled. */
+#define PERCPU_ARENA_DEFAULT percpu_arena_disabled
+
+/*
+ * When allocation_size >= oversize_threshold, use the dedicated huge arena
+ * (unless an arena index was explicitly specified).  0 disables the feature.
+ */
+#define OVERSIZE_THRESHOLD_DEFAULT (8 << 20)
+
+/* Arena creation settings; arena_new() takes a pointer to one of these. */
+struct arena_config_s {
+	/* extent hooks to be used for the arena */
+	extent_hooks_t *extent_hooks;
+
+	/*
+	 * Use extent hooks for metadata (base) allocations when true.
+	 */
+	bool metadata_use_hooks;
+};
+
+typedef struct arena_config_s arena_config_t;
+
+extern const arena_config_t arena_config_default;
+
+#endif /* JEMALLOC_INTERNAL_ARENA_TYPES_H */
--- a/include/jemalloc/internal/assert.h
+++ b/include/jemalloc/internal/assert.h
@ -1,45 +1,63 @@
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/malloc_io.h"
+#include "jemalloc/internal/util.h"
+
 /*
 * Define a custom assert() in order to reduce the chances of deadlock during
 * assertion failure.
 */
 #ifndef assert
-#define	assert(e) do {							\
-	if (unlikely(config_debug && !(e))) {				\
-		malloc_printf(						\
-		    "<jemalloc>: %s:%d: Failed assertion: \"%s\"\n",	\
-		    __FILE__, __LINE__, #e);				\
-		abort();						\
-	}								\
-} while (0)
+#	define assert(e)                                                            \
+		do {                                                                 \
+			if (unlikely(config_debug && !(e))) {                        \
+				malloc_printf(                                       \
+				    "<jemalloc>: %s:%d: Failed assertion: \"%s\"\n", \
+				    __FILE__, __LINE__, #e);                         \
+				abort();                                             \
+			}                                                            \
+		} while (0)
 #endif

 #ifndef not_reached
-#define	not_reached() do {						\
-	if (config_debug) {						\
-		malloc_printf(						\
-		    "<jemalloc>: %s:%d: Unreachable code reached\n",	\
-		    __FILE__, __LINE__);				\
-		abort();						\
-	}								\
-	unreachable();							\
-} while (0)
+/*
+ * Marks a point that must never execute.  With config_debug set, prints the
+ * file and line and aborts; otherwise control reaches unreachable().
+ */
+#	define not_reached()                                                        \
+		do {                                                                 \
+			if (config_debug) {                                          \
+				malloc_printf(                                       \
+				    "<jemalloc>: %s:%d: Unreachable code reached\n", \
+				    __FILE__, __LINE__);                             \
+				abort();                                             \
+			}                                                            \
+			unreachable();                                               \
+		} while (0)
 #endif

 #ifndef not_implemented
-#define	not_implemented() do {						\
-	if (config_debug) {						\
-		malloc_printf("<jemalloc>: %s:%d: Not implemented\n",	\
-		    __FILE__, __LINE__);				\
-		abort();						\
-	}								\
-} while (0)
+/*
+ * With config_debug set, prints a "Not implemented" message with the file and
+ * line and aborts; otherwise does nothing.
+ */
+#	define not_implemented()                                              \
+		do {                                                           \
+			if (config_debug) {                                    \
+				malloc_printf(                                 \
+				    "<jemalloc>: %s:%d: Not implemented\n",    \
+				    __FILE__, __LINE__);                       \
+				abort();                                       \
+			}                                                      \
+		} while (0)
 #endif

 #ifndef assert_not_implemented
-#define	assert_not_implemented(e) do {					\
-	if (unlikely(config_debug && !(e)))				\
-		not_implemented();					\
-} while (0)
+/* With config_debug set, invokes not_implemented() when `e` is false. */
+#	define assert_not_implemented(e)                                      \
+		do {                                                           \
+			if (unlikely(config_debug && !(e))) {                  \
+				not_implemented();                             \
+			}                                                      \
+		} while (0)
 #endif

-
+/*
+ * Use to assert a particular configuration, e.g., cassert(config_debug).
+ * Invokes not_reached() when `c` is false; unlike assert(), the test itself
+ * is not gated on config_debug.
+ */
+#ifndef cassert
+#	define cassert(c)                                                     \
+		do {                                                           \
+			if (unlikely(!(c))) {                                  \
+				not_reached();                                 \
+			}                                                      \
+		} while (0)
+#endif
--- a/include/jemalloc/internal/atomic.h
+++ b/include/jemalloc/internal/atomic.h
@ -1,651 +1,108 @@
-/******************************************************************************/
-#ifdef JEMALLOC_H_TYPES
+#ifndef JEMALLOC_INTERNAL_ATOMIC_H
+#define JEMALLOC_INTERNAL_ATOMIC_H

-#endif /* JEMALLOC_H_TYPES */
-/******************************************************************************/
-#ifdef JEMALLOC_H_STRUCTS
+#include "jemalloc/internal/jemalloc_preamble.h"

-#endif /* JEMALLOC_H_STRUCTS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_EXTERNS
+#define JEMALLOC_U8_ATOMICS
+#if defined(JEMALLOC_GCC_ATOMIC_ATOMICS)
+#	include "jemalloc/internal/atomic_gcc_atomic.h"
+#	if !defined(JEMALLOC_GCC_U8_ATOMIC_ATOMICS)
+#		undef JEMALLOC_U8_ATOMICS
+#	endif
+#elif defined(JEMALLOC_GCC_SYNC_ATOMICS)
+#	include "jemalloc/internal/atomic_gcc_sync.h"
+#	if !defined(JEMALLOC_GCC_U8_SYNC_ATOMICS)
+#		undef JEMALLOC_U8_ATOMICS
+#	endif
+#elif defined(_MSC_VER)
+#	include "jemalloc/internal/atomic_msvc.h"
+#elif defined(JEMALLOC_C11_ATOMICS)
+#	include "jemalloc/internal/atomic_c11.h"
+#else
+#	error "Don't have atomics implemented on this platform."
+#endif

-#define	atomic_read_uint64(p)	atomic_add_uint64(p, 0)
-#define	atomic_read_uint32(p)	atomic_add_uint32(p, 0)
-#define	atomic_read_p(p)	atomic_add_p(p, NULL)
-#define	atomic_read_z(p)	atomic_add_z(p, 0)
-#define	atomic_read_u(p)	atomic_add_u(p, 0)
-
-#endif /* JEMALLOC_H_EXTERNS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_INLINES
+#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE

 /*
- * All arithmetic functions return the arithmetic result of the atomic
- * operation.  Some atomic operation APIs return the value prior to mutation, in
- * which case the following functions must redundantly compute the result so
- * that it can be returned.  These functions are normally inlined, so the extra
- * operations can be optimized away if the return values aren't used by the
- * callers.
+ * This header gives more or less a backport of C11 atomics. The user can write
+ * JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_sizeof_type); to generate
+ * counterparts of the C11 atomic functions for type, as so:
+ *   JEMALLOC_GENERATE_ATOMICS(int *, pi, 3);
+ * and then write things like:
+ *   int *some_ptr;
+ *   atomic_pi_t atomic_ptr_to_int;
+ *   atomic_store_pi(&atomic_ptr_to_int, some_ptr, ATOMIC_RELAXED);
+ *   int *prev_value = atomic_exchange_pi(&atomic_ptr_to_int, NULL,
+ *       ATOMIC_ACQ_REL);
+ *   assert(some_ptr == prev_value);
+ * and expect things to work in the obvious way.
 *
- *   <t> atomic_read_<t>(<t> *p) { return (*p); }
- *   <t> atomic_add_<t>(<t> *p, <t> x) { return (*p += x); }
- *   <t> atomic_sub_<t>(<t> *p, <t> x) { return (*p -= x); }
- *   bool atomic_cas_<t>(<t> *p, <t> c, <t> s)
- *   {
- *     if (*p != c)
- *       return (true);
- *     *p = s;
- *     return (false);
- *   }
- *   void atomic_write_<t>(<t> *p, <t> x) { *p = x; }
+ * Also included (with naming differences to avoid conflicts with the standard
+ * library):
+ *   atomic_fence(atomic_memory_order_t) (mimics C11's atomic_thread_fence).
+ *   ATOMIC_INIT (mimics C11's ATOMIC_VAR_INIT).
 */

-#ifndef JEMALLOC_ENABLE_INLINE
-uint64_t	atomic_add_uint64(uint64_t *p, uint64_t x);
-uint64_t	atomic_sub_uint64(uint64_t *p, uint64_t x);
-bool	atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s);
-void	atomic_write_uint64(uint64_t *p, uint64_t x);
-uint32_t	atomic_add_uint32(uint32_t *p, uint32_t x);
-uint32_t	atomic_sub_uint32(uint32_t *p, uint32_t x);
-bool	atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s);
-void	atomic_write_uint32(uint32_t *p, uint32_t x);
-void	*atomic_add_p(void **p, void *x);
-void	*atomic_sub_p(void **p, void *x);
-bool	atomic_cas_p(void **p, void *c, void *s);
-void	atomic_write_p(void **p, const void *x);
-size_t	atomic_add_z(size_t *p, size_t x);
-size_t	atomic_sub_z(size_t *p, size_t x);
-bool	atomic_cas_z(size_t *p, size_t c, size_t s);
-void	atomic_write_z(size_t *p, size_t x);
-unsigned	atomic_add_u(unsigned *p, unsigned x);
-unsigned	atomic_sub_u(unsigned *p, unsigned x);
-bool	atomic_cas_u(unsigned *p, unsigned c, unsigned s);
-void	atomic_write_u(unsigned *p, unsigned x);
-#endif
+/*
+ * Pure convenience, so that we don't have to type "atomic_memory_order_"
+ * quite so often.
+ */
+#define ATOMIC_RELAXED atomic_memory_order_relaxed
+#define ATOMIC_ACQUIRE atomic_memory_order_acquire
+#define ATOMIC_RELEASE atomic_memory_order_release
+#define ATOMIC_ACQ_REL atomic_memory_order_acq_rel
+#define ATOMIC_SEQ_CST atomic_memory_order_seq_cst

-#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_ATOMIC_C_))
-/******************************************************************************/
-/* 64-bit operations. */
+/*
+ * Another convenience -- simple atomic helper functions.
+ *
+ * In addition to everything JEMALLOC_GENERATE_INT_ATOMICS(type, short_type,
+ * lg_size) provides, this generates atomic_load_add_store_<short_type>() and
+ * atomic_load_sub_store_<short_type>().  Each performs a relaxed load, adds or
+ * subtracts inc, and writes the result back with a relaxed store.  The load
+ * and the store are two separate operations, so the update as a whole is not
+ * an atomic read-modify-write; a concurrent writer's update can be overwritten.
+ */
+#define JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(type, short_type, lg_size)      \
+	JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size)               \
+	ATOMIC_INLINE void atomic_load_add_store_##short_type(                 \
+	    atomic_##short_type##_t *a, type inc) {                            \
+		type oldval = atomic_load_##short_type(a, ATOMIC_RELAXED);     \
+		type newval = oldval + inc;                                    \
+		atomic_store_##short_type(a, newval, ATOMIC_RELAXED);          \
+	}                                                                      \
+	ATOMIC_INLINE void atomic_load_sub_store_##short_type(                 \
+	    atomic_##short_type##_t *a, type inc) {                            \
+		type oldval = atomic_load_##short_type(a, ATOMIC_RELAXED);     \
+		type newval = oldval - inc;                                    \
+		atomic_store_##short_type(a, newval, ATOMIC_RELAXED);          \
+	}
+
+/*
+ * Not all platforms have 64-bit atomics.  If we do, this #define exposes that
+ * fact.
+ */
 #if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
-#  if (defined(__amd64__) || defined(__x86_64__))
-JEMALLOC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-	uint64_t t = x;
-
-	asm volatile (
-	    "lock; xaddq %0, %1;"
-	    : "+r" (t), "=m" (*p) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    );
-
-	return (t + x);
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-	uint64_t t;
-
-	x = (uint64_t)(-(int64_t)x);
-	t = x;
-	asm volatile (
-	    "lock; xaddq %0, %1;"
-	    : "+r" (t), "=m" (*p) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    );
-
-	return (t + x);
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s)
-{
-	uint8_t success;
-
-	asm volatile (
-	    "lock; cmpxchgq %4, %0;"
-	    "sete %1;"
-	    : "=m" (*p), "=a" (success) /* Outputs. */
-	    : "m" (*p), "a" (c), "r" (s) /* Inputs. */
-	    : "memory" /* Clobbers. */
-	    );
-
-	return (!(bool)success);
-}
-
-JEMALLOC_INLINE void
-atomic_write_uint64(uint64_t *p, uint64_t x)
-{
-
-	asm volatile (
-	    "xchgq %1, %0;" /* Lock is implied by xchgq. */
-	    : "=m" (*p), "+r" (x) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    : "memory" /* Clobbers. */
-	    );
-}
-#  elif (defined(JEMALLOC_C11ATOMICS))
-JEMALLOC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
-	return (atomic_fetch_add(a, x) + x);
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
-	return (atomic_fetch_sub(a, x) - x);
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s)
-{
-	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
-	return (!atomic_compare_exchange_strong(a, &c, s));
-}
-
-JEMALLOC_INLINE void
-atomic_write_uint64(uint64_t *p, uint64_t x)
-{
-	volatile atomic_uint_least64_t *a = (volatile atomic_uint_least64_t *)p;
-	atomic_store(a, x);
-}
-#  elif (defined(JEMALLOC_ATOMIC9))
-JEMALLOC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-
-	/*
-	 * atomic_fetchadd_64() doesn't exist, but we only ever use this
-	 * function on LP64 systems, so atomic_fetchadd_long() will do.
-	 */
-	assert(sizeof(uint64_t) == sizeof(unsigned long));
-
-	return (atomic_fetchadd_long(p, (unsigned long)x) + x);
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-
-	assert(sizeof(uint64_t) == sizeof(unsigned long));
-
-	return (atomic_fetchadd_long(p, (unsigned long)(-(long)x)) - x);
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s)
-{
-
-	assert(sizeof(uint64_t) == sizeof(unsigned long));
-
-	return (!atomic_cmpset_long(p, (unsigned long)c, (unsigned long)s));
-}
-
-JEMALLOC_INLINE void
-atomic_write_uint64(uint64_t *p, uint64_t x)
-{
-
-	assert(sizeof(uint64_t) == sizeof(unsigned long));
-
-	atomic_store_rel_long(p, x);
-}
-#  elif (defined(JEMALLOC_OSATOMIC))
-JEMALLOC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-
-	return (OSAtomicAdd64((int64_t)x, (int64_t *)p));
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-
-	return (OSAtomicAdd64(-((int64_t)x), (int64_t *)p));
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s)
-{
-
-	return (!OSAtomicCompareAndSwap64(c, s, (int64_t *)p));
-}
-
-JEMALLOC_INLINE void
-atomic_write_uint64(uint64_t *p, uint64_t x)
-{
-	uint64_t o;
-
-	/*The documented OSAtomic*() API does not expose an atomic exchange. */
-	do {
-		o = atomic_read_uint64(p);
-	} while (atomic_cas_uint64(p, o, x));
-}
-#  elif (defined(_MSC_VER))
-JEMALLOC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-
-	return (InterlockedExchangeAdd64(p, x) + x);
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-
-	return (InterlockedExchangeAdd64(p, -((int64_t)x)) - x);
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s)
-{
-	uint64_t o;
-
-	o = InterlockedCompareExchange64(p, s, c);
-	return (o != c);
-}
-
-JEMALLOC_INLINE void
-atomic_write_uint64(uint64_t *p, uint64_t x)
-{
-
-	InterlockedExchange64(p, x);
-}
-#  elif (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8) || \
-    defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_8))
-JEMALLOC_INLINE uint64_t
-atomic_add_uint64(uint64_t *p, uint64_t x)
-{
-
-	return (__sync_add_and_fetch(p, x));
-}
-
-JEMALLOC_INLINE uint64_t
-atomic_sub_uint64(uint64_t *p, uint64_t x)
-{
-
-	return (__sync_sub_and_fetch(p, x));
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_uint64(uint64_t *p, uint64_t c, uint64_t s)
-{
-
-	return (!__sync_bool_compare_and_swap(p, c, s));
-}
-
-JEMALLOC_INLINE void
-atomic_write_uint64(uint64_t *p, uint64_t x)
-{
-
-	__sync_lock_test_and_set(p, x);
-}
-#  else
-#    error "Missing implementation for 64-bit atomic operations"
-#  endif
+#	define JEMALLOC_ATOMIC_U64
 #endif

-/******************************************************************************/
-/* 32-bit operations. */
-#if (defined(__i386__) || defined(__amd64__) || defined(__x86_64__))
-JEMALLOC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-	uint32_t t = x;
+JEMALLOC_GENERATE_ATOMICS(void *, p, LG_SIZEOF_PTR)

-	asm volatile (
-	    "lock; xaddl %0, %1;"
-	    : "+r" (t), "=m" (*p) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    );
+/*
+ * There's no actual guarantee that sizeof(bool) == 1, but it's true on the only
+ * platform that actually needs to know the size, MSVC.
+ */
+JEMALLOC_GENERATE_ATOMICS(bool, b, 0)

-	return (t + x);
-}
+JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(unsigned, u, LG_SIZEOF_INT)

-JEMALLOC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-	uint32_t t;
+JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(int, i, LG_SIZEOF_INT)

-	x = (uint32_t)(-(int32_t)x);
-	t = x;
-	asm volatile (
-	    "lock; xaddl %0, %1;"
-	    : "+r" (t), "=m" (*p) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    );
+JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(size_t, zu, LG_SIZEOF_PTR)

-	return (t + x);
-}
+JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(ssize_t, zd, LG_SIZEOF_PTR)

-JEMALLOC_INLINE bool
-atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s)
-{
-	uint8_t success;
+JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(uint8_t, u8, 0)

-	asm volatile (
-	    "lock; cmpxchgl %4, %0;"
-	    "sete %1;"
-	    : "=m" (*p), "=a" (success) /* Outputs. */
-	    : "m" (*p), "a" (c), "r" (s) /* Inputs. */
-	    : "memory"
-	    );
+JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(uint32_t, u32, 2)

-	return (!(bool)success);
-}
-
-JEMALLOC_INLINE void
-atomic_write_uint32(uint32_t *p, uint32_t x)
-{
-
-	asm volatile (
-	    "xchgl %1, %0;" /* Lock is implied by xchgl. */
-	    : "=m" (*p), "+r" (x) /* Outputs. */
-	    : "m" (*p) /* Inputs. */
-	    : "memory" /* Clobbers. */
-	    );
-}
-#  elif (defined(JEMALLOC_C11ATOMICS))
-JEMALLOC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-	volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
-	return (atomic_fetch_add(a, x) + x);
-}
-
-JEMALLOC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-	volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
-	return (atomic_fetch_sub(a, x) - x);
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s)
-{
-	volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
-	return (!atomic_compare_exchange_strong(a, &c, s));
-}
-
-JEMALLOC_INLINE void
-atomic_write_uint32(uint32_t *p, uint32_t x)
-{
-	volatile atomic_uint_least32_t *a = (volatile atomic_uint_least32_t *)p;
-	atomic_store(a, x);
-}
-#elif (defined(JEMALLOC_ATOMIC9))
-JEMALLOC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-
-	return (atomic_fetchadd_32(p, x) + x);
-}
-
-JEMALLOC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-
-	return (atomic_fetchadd_32(p, (uint32_t)(-(int32_t)x)) - x);
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s)
-{
-
-	return (!atomic_cmpset_32(p, c, s));
-}
-
-JEMALLOC_INLINE void
-atomic_write_uint32(uint32_t *p, uint32_t x)
-{
-
-	atomic_store_rel_32(p, x);
-}
-#elif (defined(JEMALLOC_OSATOMIC))
-JEMALLOC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-
-	return (OSAtomicAdd32((int32_t)x, (int32_t *)p));
-}
-
-JEMALLOC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-
-	return (OSAtomicAdd32(-((int32_t)x), (int32_t *)p));
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s)
-{
-
-	return (!OSAtomicCompareAndSwap32(c, s, (int32_t *)p));
-}
-
-JEMALLOC_INLINE void
-atomic_write_uint32(uint32_t *p, uint32_t x)
-{
-	uint32_t o;
-
-	/*The documented OSAtomic*() API does not expose an atomic exchange. */
-	do {
-		o = atomic_read_uint32(p);
-	} while (atomic_cas_uint32(p, o, x));
-}
-#elif (defined(_MSC_VER))
-JEMALLOC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-
-	return (InterlockedExchangeAdd(p, x) + x);
-}
-
-JEMALLOC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-
-	return (InterlockedExchangeAdd(p, -((int32_t)x)) - x);
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s)
-{
-	uint32_t o;
-
-	o = InterlockedCompareExchange(p, s, c);
-	return (o != c);
-}
-
-JEMALLOC_INLINE void
-atomic_write_uint32(uint32_t *p, uint32_t x)
-{
-
-	InterlockedExchange(p, x);
-}
-#elif (defined(__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4) || \
- defined(JE_FORCE_SYNC_COMPARE_AND_SWAP_4))
-JEMALLOC_INLINE uint32_t
-atomic_add_uint32(uint32_t *p, uint32_t x)
-{
-
-	return (__sync_add_and_fetch(p, x));
-}
-
-JEMALLOC_INLINE uint32_t
-atomic_sub_uint32(uint32_t *p, uint32_t x)
-{
-
-	return (__sync_sub_and_fetch(p, x));
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_uint32(uint32_t *p, uint32_t c, uint32_t s)
-{
-
-	return (!__sync_bool_compare_and_swap(p, c, s));
-}
-
-JEMALLOC_INLINE void
-atomic_write_uint32(uint32_t *p, uint32_t x)
-{
-
-	__sync_lock_test_and_set(p, x);
-}
-#else
-#  error "Missing implementation for 32-bit atomic operations"
+#ifdef JEMALLOC_ATOMIC_U64
+JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(uint64_t, u64, 3)
 #endif

-/******************************************************************************/
-/* Pointer operations. */
-JEMALLOC_INLINE void *
-atomic_add_p(void **p, void *x)
-{
+#undef ATOMIC_INLINE

-#if (LG_SIZEOF_PTR == 3)
-	return ((void *)atomic_add_uint64((uint64_t *)p, (uint64_t)x));
-#elif (LG_SIZEOF_PTR == 2)
-	return ((void *)atomic_add_uint32((uint32_t *)p, (uint32_t)x));
-#endif
-}
-
-JEMALLOC_INLINE void *
-atomic_sub_p(void **p, void *x)
-{
-
-#if (LG_SIZEOF_PTR == 3)
-	return ((void *)atomic_add_uint64((uint64_t *)p,
-	    (uint64_t)-((int64_t)x)));
-#elif (LG_SIZEOF_PTR == 2)
-	return ((void *)atomic_add_uint32((uint32_t *)p,
-	    (uint32_t)-((int32_t)x)));
-#endif
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_p(void **p, void *c, void *s)
-{
-
-#if (LG_SIZEOF_PTR == 3)
-	return (atomic_cas_uint64((uint64_t *)p, (uint64_t)c, (uint64_t)s));
-#elif (LG_SIZEOF_PTR == 2)
-	return (atomic_cas_uint32((uint32_t *)p, (uint32_t)c, (uint32_t)s));
-#endif
-}
-
-JEMALLOC_INLINE void
-atomic_write_p(void **p, const void *x)
-{
-
-#if (LG_SIZEOF_PTR == 3)
-	atomic_write_uint64((uint64_t *)p, (uint64_t)x);
-#elif (LG_SIZEOF_PTR == 2)
-	atomic_write_uint32((uint32_t *)p, (uint32_t)x);
-#endif
-}
-
-/******************************************************************************/
-/* size_t operations. */
-JEMALLOC_INLINE size_t
-atomic_add_z(size_t *p, size_t x)
-{
-
-#if (LG_SIZEOF_PTR == 3)
-	return ((size_t)atomic_add_uint64((uint64_t *)p, (uint64_t)x));
-#elif (LG_SIZEOF_PTR == 2)
-	return ((size_t)atomic_add_uint32((uint32_t *)p, (uint32_t)x));
-#endif
-}
-
-JEMALLOC_INLINE size_t
-atomic_sub_z(size_t *p, size_t x)
-{
-
-#if (LG_SIZEOF_PTR == 3)
-	return ((size_t)atomic_add_uint64((uint64_t *)p,
-	    (uint64_t)-((int64_t)x)));
-#elif (LG_SIZEOF_PTR == 2)
-	return ((size_t)atomic_add_uint32((uint32_t *)p,
-	    (uint32_t)-((int32_t)x)));
-#endif
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_z(size_t *p, size_t c, size_t s)
-{
-
-#if (LG_SIZEOF_PTR == 3)
-	return (atomic_cas_uint64((uint64_t *)p, (uint64_t)c, (uint64_t)s));
-#elif (LG_SIZEOF_PTR == 2)
-	return (atomic_cas_uint32((uint32_t *)p, (uint32_t)c, (uint32_t)s));
-#endif
-}
-
-JEMALLOC_INLINE void
-atomic_write_z(size_t *p, size_t x)
-{
-
-#if (LG_SIZEOF_PTR == 3)
-	atomic_write_uint64((uint64_t *)p, (uint64_t)x);
-#elif (LG_SIZEOF_PTR == 2)
-	atomic_write_uint32((uint32_t *)p, (uint32_t)x);
-#endif
-}
-
-/******************************************************************************/
-/* unsigned operations. */
-JEMALLOC_INLINE unsigned
-atomic_add_u(unsigned *p, unsigned x)
-{
-
-#if (LG_SIZEOF_INT == 3)
-	return ((unsigned)atomic_add_uint64((uint64_t *)p, (uint64_t)x));
-#elif (LG_SIZEOF_INT == 2)
-	return ((unsigned)atomic_add_uint32((uint32_t *)p, (uint32_t)x));
-#endif
-}
-
-JEMALLOC_INLINE unsigned
-atomic_sub_u(unsigned *p, unsigned x)
-{
-
-#if (LG_SIZEOF_INT == 3)
-	return ((unsigned)atomic_add_uint64((uint64_t *)p,
-	    (uint64_t)-((int64_t)x)));
-#elif (LG_SIZEOF_INT == 2)
-	return ((unsigned)atomic_add_uint32((uint32_t *)p,
-	    (uint32_t)-((int32_t)x)));
-#endif
-}
-
-JEMALLOC_INLINE bool
-atomic_cas_u(unsigned *p, unsigned c, unsigned s)
-{
-
-#if (LG_SIZEOF_INT == 3)
-	return (atomic_cas_uint64((uint64_t *)p, (uint64_t)c, (uint64_t)s));
-#elif (LG_SIZEOF_INT == 2)
-	return (atomic_cas_uint32((uint32_t *)p, (uint32_t)c, (uint32_t)s));
-#endif
-}
-
-JEMALLOC_INLINE void
-atomic_write_u(unsigned *p, unsigned x)
-{
-
-#if (LG_SIZEOF_INT == 3)
-	atomic_write_uint64((uint64_t *)p, (uint64_t)x);
-#elif (LG_SIZEOF_INT == 2)
-	atomic_write_uint32((uint32_t *)p, (uint32_t)x);
-#endif
-}
-
-/******************************************************************************/
-#endif
-
-#endif /* JEMALLOC_H_INLINES */
-/******************************************************************************/
+#endif /* JEMALLOC_INTERNAL_ATOMIC_H */
--- a/include/jemalloc/internal/atomic_c11.h
+++ b/include/jemalloc/internal/atomic_c11.h
@ -0,0 +1,94 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_C11_H
+#define JEMALLOC_INTERNAL_ATOMIC_C11_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include <stdatomic.h>
+
+#define ATOMIC_INIT(...) ATOMIC_VAR_INIT(__VA_ARGS__)
+
+#define atomic_memory_order_t memory_order
+#define atomic_memory_order_relaxed memory_order_relaxed
+#define atomic_memory_order_acquire memory_order_acquire
+#define atomic_memory_order_release memory_order_release
+#define atomic_memory_order_acq_rel memory_order_acq_rel
+#define atomic_memory_order_seq_cst memory_order_seq_cst
+
+#define atomic_fence atomic_thread_fence
+
+/*
+ * Defines atomic_<short_type>_t as _Atomic(type) along with load, store,
+ * exchange, and weak/strong compare-exchange wrappers that forward to the
+ * corresponding <stdatomic.h> *_explicit operations.  lg_size is unused by
+ * this backend.
+ */
+/* clang-format off */
+#define JEMALLOC_GENERATE_ATOMICS(type, short_type,			\
+    /* unused */ lg_size)						\
+typedef _Atomic(type) atomic_##short_type##_t;				\
+									\
+ATOMIC_INLINE type							\
+atomic_load_##short_type(const atomic_##short_type##_t *a,		\
+    atomic_memory_order_t mo) {						\
+	/*								\
+	 * A strict interpretation of the C standard prevents		\
+	 * atomic_load from taking a const argument, but it's		\
+	 * convenient for our purposes. This cast is a workaround.	\
+	 */								\
+	atomic_##short_type##_t* a_nonconst =				\
+	    (atomic_##short_type##_t*)a;				\
+	return atomic_load_explicit(a_nonconst, mo);			\
+}									\
+									\
+ATOMIC_INLINE void							\
+atomic_store_##short_type(atomic_##short_type##_t *a,			\
+    type val, atomic_memory_order_t mo) {				\
+	atomic_store_explicit(a, val, mo);				\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	return atomic_exchange_explicit(a, val, mo);			\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	return atomic_compare_exchange_weak_explicit(a, expected,	\
+	    desired, success_mo, failure_mo);				\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	return atomic_compare_exchange_strong_explicit(a, expected,	\
+	    desired, success_mo, failure_mo);				\
+}
+/* clang-format on */
+
+/*
+ * Integral types have some special operations available that non-integral ones
+ * lack.  On top of JEMALLOC_GENERATE_ATOMICS, this adds fetch_add, fetch_sub,
+ * fetch_and, fetch_or, and fetch_xor wrappers; each forwards to the matching
+ * atomic_fetch_*_explicit operation and returns its result.
+ */
+#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, /* unused */ lg_size)  \
+	JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size)      \
+                                                                               \
+	ATOMIC_INLINE type atomic_fetch_add_##short_type(                      \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return atomic_fetch_add_explicit(a, val, mo);                  \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE type atomic_fetch_sub_##short_type(                      \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return atomic_fetch_sub_explicit(a, val, mo);                  \
+	}                                                                      \
+	ATOMIC_INLINE type atomic_fetch_and_##short_type(                      \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return atomic_fetch_and_explicit(a, val, mo);                  \
+	}                                                                      \
+	ATOMIC_INLINE type atomic_fetch_or_##short_type(                       \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return atomic_fetch_or_explicit(a, val, mo);                   \
+	}                                                                      \
+	ATOMIC_INLINE type atomic_fetch_xor_##short_type(                      \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return atomic_fetch_xor_explicit(a, val, mo);                  \
+	}
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_C11_H */
--- a/include/jemalloc/internal/atomic_gcc_atomic.h
+++ b/include/jemalloc/internal/atomic_gcc_atomic.h
@ -0,0 +1,121 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H
+#define JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/assert.h"
+
+#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
+
+#define ATOMIC_INIT(...)                                                       \
+	{ __VA_ARGS__ }
+
+typedef enum {
+	atomic_memory_order_relaxed,
+	atomic_memory_order_acquire,
+	atomic_memory_order_release,
+	atomic_memory_order_acq_rel,
+	atomic_memory_order_seq_cst
+} atomic_memory_order_t;
+
+/*
+ * Translates a jemalloc atomic_memory_order_t value into the corresponding
+ * GCC __ATOMIC_* memory-order constant, for passing to the __atomic builtins.
+ */
+ATOMIC_INLINE int
+atomic_enum_to_builtin(atomic_memory_order_t mo) {
+	switch (mo) {
+	case atomic_memory_order_relaxed:
+		return __ATOMIC_RELAXED;
+	case atomic_memory_order_acquire:
+		return __ATOMIC_ACQUIRE;
+	case atomic_memory_order_release:
+		return __ATOMIC_RELEASE;
+	case atomic_memory_order_acq_rel:
+		return __ATOMIC_ACQ_REL;
+	case atomic_memory_order_seq_cst:
+		return __ATOMIC_SEQ_CST;
+	}
+	/* Can't happen; the switch is exhaustive. */
+	not_reached();
+}
+
+/* Issues a thread fence with ordering mo via __atomic_thread_fence(). */
+ATOMIC_INLINE void
+atomic_fence(atomic_memory_order_t mo) {
+	__atomic_thread_fence(atomic_enum_to_builtin(mo));
+}
+
+/*
+ * Defines atomic_<short_type>_t as a struct wrapping a single `type repr`
+ * member, plus load, store, exchange, and weak/strong compare-exchange
+ * functions implemented with the generic GCC __atomic builtins operating on
+ * that member.  lg_size is unused by this backend.
+ *
+ * NOTE(review): the compare-exchange functions mark `expected` UNUSED even
+ * though it is passed to __atomic_compare_exchange; confirm whether the
+ * annotation is still needed.
+ */
+#define JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size)      \
+	typedef struct {                                                       \
+		type repr;                                                     \
+	} atomic_##short_type##_t;                                             \
+                                                                               \
+	ATOMIC_INLINE type atomic_load_##short_type(                           \
+	    const atomic_##short_type##_t *a, atomic_memory_order_t mo) {      \
+		type result;                                                   \
+		__atomic_load(&a->repr, &result, atomic_enum_to_builtin(mo));  \
+		return result;                                                 \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE void atomic_store_##short_type(                          \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		__atomic_store(&a->repr, &val, atomic_enum_to_builtin(mo));    \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE type atomic_exchange_##short_type(                       \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		type result;                                                   \
+		__atomic_exchange(                                             \
+		    &a->repr, &val, &result, atomic_enum_to_builtin(mo));      \
+		return result;                                                 \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE bool atomic_compare_exchange_weak_##short_type(          \
+	    atomic_##short_type##_t *a, UNUSED type *expected, type desired,   \
+	    atomic_memory_order_t success_mo,                                  \
+	    atomic_memory_order_t failure_mo) {                                \
+		return __atomic_compare_exchange(&a->repr, expected, &desired, \
+		    true, atomic_enum_to_builtin(success_mo),                  \
+		    atomic_enum_to_builtin(failure_mo));                       \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE bool atomic_compare_exchange_strong_##short_type(        \
+	    atomic_##short_type##_t *a, UNUSED type *expected, type desired,   \
+	    atomic_memory_order_t success_mo,                                  \
+	    atomic_memory_order_t failure_mo) {                                \
+		return __atomic_compare_exchange(&a->repr, expected, &desired, \
+		    false, atomic_enum_to_builtin(success_mo),                 \
+		    atomic_enum_to_builtin(failure_mo));                       \
+	}
+
+/*
+ * Integer variant: everything from JEMALLOC_GENERATE_ATOMICS plus fetch_add,
+ * fetch_sub, fetch_and, fetch_or, and fetch_xor, each forwarding to the
+ * matching __atomic_fetch_* builtin on the repr member and returning its
+ * result.
+ */
+#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, /* unused */ lg_size)  \
+	JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size)      \
+                                                                               \
+	ATOMIC_INLINE type atomic_fetch_add_##short_type(                      \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return __atomic_fetch_add(                                     \
+		    &a->repr, val, atomic_enum_to_builtin(mo));                \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE type atomic_fetch_sub_##short_type(                      \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return __atomic_fetch_sub(                                     \
+		    &a->repr, val, atomic_enum_to_builtin(mo));                \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE type atomic_fetch_and_##short_type(                      \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return __atomic_fetch_and(                                     \
+		    &a->repr, val, atomic_enum_to_builtin(mo));                \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE type atomic_fetch_or_##short_type(                       \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return __atomic_fetch_or(                                      \
+		    &a->repr, val, atomic_enum_to_builtin(mo));                \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE type atomic_fetch_xor_##short_type(                      \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return __atomic_fetch_xor(                                     \
+		    &a->repr, val, atomic_enum_to_builtin(mo));                \
+	}
+
+#undef ATOMIC_INLINE
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H */
--- a/include/jemalloc/internal/atomic_gcc_sync.h
+++ b/include/jemalloc/internal/atomic_gcc_sync.h
@ -0,0 +1,197 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H
+#define JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+
+#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
+
+/*
+ * Static initializer for an atomic_*_t.  It brace-wraps its arguments, which
+ * matches the single-member `repr` struct that JEMALLOC_GENERATE_ATOMICS
+ * emits.
+ */
+#define ATOMIC_INIT(...)                                                       \
+	{ __VA_ARGS__ }
+
+/*
+ * Memory orderings accepted by the atomic_* wrappers in this header.  The
+ * names follow the C11 memory_order_* constants; there is no consume
+ * ordering.
+ */
+typedef enum {
+	atomic_memory_order_relaxed,
+	atomic_memory_order_acquire,
+	atomic_memory_order_release,
+	atomic_memory_order_acq_rel,
+	atomic_memory_order_seq_cst
+} atomic_memory_order_t;
+
+/*
+ * Issues a fence of strength mo using only what pre-__atomic GCC offers: empty
+ * asm statements with a "memory" clobber (compiler barriers), per-architecture
+ * fence instructions, and __sync_synchronize().
+ *
+ *   relaxed:  compiler barrier only.
+ *   seq_cst:  __sync_synchronize() bracketed by compiler barriers.
+ *   others:   compiler barriers around an architecture-specific fence --
+ *             nothing on x86, lwsync on ppc64, sync on 32-bit ppc, a membar on
+ *             64-bit sparc, and __sync_synchronize() everywhere else.
+ */
+ATOMIC_INLINE void
+atomic_fence(atomic_memory_order_t mo) {
+	/* Easy cases first: no barrier, and full barrier. */
+	if (mo == atomic_memory_order_relaxed) {
+		asm volatile("" ::: "memory");
+		return;
+	}
+	if (mo == atomic_memory_order_seq_cst) {
+		asm volatile("" ::: "memory");
+		__sync_synchronize();
+		asm volatile("" ::: "memory");
+		return;
+	}
+	/* From here on mo is acquire, release, or acq_rel. */
+	asm volatile("" ::: "memory");
+#if defined(__i386__) || defined(__x86_64__)
+	/* This is implicit on x86. */
+#elif defined(__ppc64__)
+	asm volatile("lwsync");
+#elif defined(__ppc__)
+	asm volatile("sync");
+#elif defined(__sparc__) && defined(__arch64__)
+	if (mo == atomic_memory_order_acquire) {
+		asm volatile("membar #LoadLoad | #LoadStore");
+	} else if (mo == atomic_memory_order_release) {
+		asm volatile("membar #LoadStore | #StoreStore");
+	} else {
+		/* Only acq_rel reaches this branch: union of the two above. */
+		asm volatile("membar #LoadLoad | #LoadStore | #StoreStore");
+	}
+#else
+	__sync_synchronize();
+#endif
+	asm volatile("" ::: "memory");
+}
+
+/*
+ * A correct implementation of seq_cst loads and stores on weakly ordered
+ * architectures could do either of the following:
+ *   1. store() is weak-fence -> store -> strong fence, load() is load ->
+ *      strong-fence.
+ *   2. store() is strong-fence -> store, load() is strong-fence -> load ->
+ *      weak-fence.
+ * The tricky thing is, load() and store() above can be the load or store
+ * portions of a gcc __sync builtin, so we have to follow GCC's lead, which
+ * means going with strategy 2.
+ * On strongly ordered architectures, the natural strategy is to stick a strong
+ * fence after seq_cst stores, and have naked loads.  So we want the strong
+ * fences in different places on different architectures.
+ * atomic_pre_sc_load_fence and atomic_post_sc_store_fence allow us to
+ * accomplish this.
+ */
+
+/*
+ * Fence issued before the load in a seq_cst atomic_load_*().  On x86 and
+ * 64-bit sparc this is only a relaxed fence (a compiler barrier), because on
+ * those targets the strong fence is placed after seq_cst stores instead; on
+ * every other architecture it is a full seq_cst fence.
+ *
+ * Declared with an explicit (void) parameter list so the definition also
+ * serves as a prototype.
+ */
+ATOMIC_INLINE void
+atomic_pre_sc_load_fence(void) {
+#if defined(__i386__) || defined(__x86_64__)                                   \
+    || (defined(__sparc__) && defined(__arch64__))
+	atomic_fence(atomic_memory_order_relaxed);
+#else
+	atomic_fence(atomic_memory_order_seq_cst);
+#endif
+}
+
+/*
+ * Fence issued after the store in a seq_cst atomic_store_*().  On x86 and
+ * 64-bit sparc this is the full seq_cst fence; on every other architecture it
+ * is only a relaxed fence (a compiler barrier), because there the strong
+ * fence is placed before seq_cst loads instead.
+ *
+ * Declared with an explicit (void) parameter list so the definition also
+ * serves as a prototype.
+ */
+ATOMIC_INLINE void
+atomic_post_sc_store_fence(void) {
+#if defined(__i386__) || defined(__x86_64__)                                   \
+    || (defined(__sparc__) && defined(__arch64__))
+	atomic_fence(atomic_memory_order_seq_cst);
+#else
+	atomic_fence(atomic_memory_order_relaxed);
+#endif
+}
+
+/* clang-format off */
+/*
+ * Generates atomic_<short_type>_t (a struct wrapping a volatile `type`) and
+ * its load, store, exchange, and compare-exchange operations on top of the
+ * legacy GCC __sync builtins.  Loads and stores are plain volatile accesses
+ * surrounded by explicit fences.  The __sync builtins take no memory-order
+ * argument, so the mo / success_mo / failure_mo parameters of exchange and
+ * compare-exchange are accepted but not consulted.
+ */
+#define JEMALLOC_GENERATE_ATOMICS(type, short_type,			\
+    /* unused */ lg_size)						\
+typedef struct {							\
+	type volatile repr;						\
+} atomic_##short_type##_t;						\
+									\
+ATOMIC_INLINE type							\
+atomic_load_##short_type(const atomic_##short_type##_t *a,		\
+    atomic_memory_order_t mo) {						\
+	type loaded;							\
+	if (mo == atomic_memory_order_seq_cst) {			\
+		atomic_pre_sc_load_fence();				\
+	}								\
+	loaded = a->repr;						\
+	if (mo != atomic_memory_order_relaxed) {			\
+		atomic_fence(atomic_memory_order_acquire);		\
+	}								\
+	return loaded;							\
+}									\
+									\
+ATOMIC_INLINE void							\
+atomic_store_##short_type(atomic_##short_type##_t *a, type val,		\
+    atomic_memory_order_t mo) {						\
+	if (mo != atomic_memory_order_relaxed) {			\
+		atomic_fence(atomic_memory_order_release);		\
+	}								\
+	a->repr = val;							\
+	if (mo == atomic_memory_order_seq_cst) {			\
+		atomic_post_sc_store_fence();				\
+	}								\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_exchange_##short_type(atomic_##short_type##_t *a, type val,	\
+    atomic_memory_order_t mo) {						\
+	/*								\
+	 * gcc 4.2 (still relevant because of FreeBSD) has no atomic	\
+	 * exchange builtin, so retry a CAS until it installs val.	\
+	 */								\
+	for (;;) {							\
+		type cur = a->repr;					\
+		if (__sync_bool_compare_and_swap(&a->repr, cur, val)) {	\
+			return cur;					\
+		}							\
+	}								\
+}									\
+									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	type want = *expected;						\
+	type seen = __sync_val_compare_and_swap(&a->repr, want,		\
+	    desired);							\
+	if (seen != want) {						\
+		/* Report the value actually found to the caller. */	\
+		*expected = seen;					\
+		return false;						\
+	}								\
+	return true;							\
+}									\
+ATOMIC_INLINE bool							\
+atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a,	\
+    type *expected, type desired, atomic_memory_order_t success_mo,	\
+    atomic_memory_order_t failure_mo) {					\
+	/* Same single CAS as the weak variant above. */		\
+	return atomic_compare_exchange_weak_##short_type(a, expected,	\
+	    desired, success_mo, failure_mo);				\
+}
+/* clang-format on */
+
+/*
+ * Extends JEMALLOC_GENERATE_ATOMICS with fetch-and-{add,sub,and,or,xor} for
+ * integer types.  Each wrapper returns whatever the corresponding
+ * __sync_fetch_and_* builtin returns (documented by GCC as the value held
+ * before the operation).  mo is accepted for interface compatibility but is
+ * not passed on, since the __sync builtins take no memory-order argument.
+ */
+#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, /* unused */ lg_size)  \
+	JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size)      \
+                                                                               \
+	ATOMIC_INLINE type atomic_fetch_add_##short_type(                      \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return __sync_fetch_and_add(&a->repr, val);                    \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE type atomic_fetch_sub_##short_type(                      \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return __sync_fetch_and_sub(&a->repr, val);                    \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE type atomic_fetch_and_##short_type(                      \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return __sync_fetch_and_and(&a->repr, val);                    \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE type atomic_fetch_or_##short_type(                       \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return __sync_fetch_and_or(&a->repr, val);                     \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE type atomic_fetch_xor_##short_type(                      \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return __sync_fetch_and_xor(&a->repr, val);                    \
+	}
+
+#undef ATOMIC_INLINE
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H */
--- a/include/jemalloc/internal/atomic_msvc.h
+++ b/include/jemalloc/internal/atomic_msvc.h
@ -0,0 +1,163 @@
+#ifndef JEMALLOC_INTERNAL_ATOMIC_MSVC_H
+#define JEMALLOC_INTERNAL_ATOMIC_MSVC_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+
+#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
+
+/*
+ * Static initializer for an atomic_*_t.  It brace-wraps its arguments, which
+ * matches the single-member `repr` struct that JEMALLOC_GENERATE_ATOMICS
+ * emits.
+ */
+#define ATOMIC_INIT(...)                                                       \
+	{ __VA_ARGS__ }
+
+/*
+ * Memory orderings accepted by the atomic_* wrappers in this header.  The
+ * names follow the C11 memory_order_* constants; there is no consume
+ * ordering.
+ */
+typedef enum {
+	atomic_memory_order_relaxed,
+	atomic_memory_order_acquire,
+	atomic_memory_order_release,
+	atomic_memory_order_acq_rel,
+	atomic_memory_order_seq_cst
+} atomic_memory_order_t;
+
+/*
+ * Storage types for atomics, selected by lg_size through
+ * ATOMIC_INTERLOCKED_REPR: atomic_repr_N_t backs an atomic that uses the
+ * 8-, 16-, unsuffixed-, or 64-bit _Interlocked* intrinsic for N = 0..3.
+ * NOTE(review): this relies on MSVC's widths for char/short/long/__int64
+ * (1/2/4/8 bytes) -- confirm for any new target.
+ */
+typedef char    atomic_repr_0_t;
+typedef short   atomic_repr_1_t;
+typedef long    atomic_repr_2_t;
+typedef __int64 atomic_repr_3_t;
+
+/*
+ * Issues a fence of strength mo.  _ReadWriteBarrier() (a compiler-level
+ * barrier) brackets the body on both sides; MemoryBarrier() is added for
+ * every non-relaxed ordering on ARM / ARM64 and only for seq_cst on
+ * x86 / x64.  Any other target is rejected at compile time.
+ */
+ATOMIC_INLINE void
+atomic_fence(atomic_memory_order_t mo) {
+	_ReadWriteBarrier();
+#if defined(_M_ARM) || defined(_M_ARM64)
+	/* ARM needs a barrier for everything but relaxed. */
+	if (mo != atomic_memory_order_relaxed) {
+		MemoryBarrier();
+	}
+#elif defined(_M_IX86) || defined(_M_X64)
+	/* x86 needs a barrier only for seq_cst. */
+	if (mo == atomic_memory_order_seq_cst) {
+		MemoryBarrier();
+	}
+#else
+#	error "Don't know how to create atomics for this platform for MSVC."
+#endif
+	_ReadWriteBarrier();
+}
+
+/* Storage type for an atomic of the given lg_size (atomic_repr_<lg_size>_t). */
+#define ATOMIC_INTERLOCKED_REPR(lg_size) atomic_repr_##lg_size##_t
+
+/*
+ * Two-level token pasting: ATOMIC_CONCAT macro-expands its arguments before
+ * ATOMIC_RAW_CONCAT pastes them, which is what lets the suffix macros below
+ * be substituted first.
+ */
+#define ATOMIC_CONCAT(a, b) ATOMIC_RAW_CONCAT(a, b)
+#define ATOMIC_RAW_CONCAT(a, b) a##b
+
+/*
+ * Builds the name of a sized _Interlocked* intrinsic, e.g.
+ * ATOMIC_INTERLOCKED_NAME(_InterlockedExchange, 3) -> _InterlockedExchange64.
+ */
+#define ATOMIC_INTERLOCKED_NAME(base_name, lg_size)                            \
+	ATOMIC_CONCAT(base_name, ATOMIC_INTERLOCKED_SUFFIX(lg_size))
+
+#define ATOMIC_INTERLOCKED_SUFFIX(lg_size)                                     \
+	ATOMIC_CONCAT(ATOMIC_INTERLOCKED_SUFFIX_, lg_size)
+
+/*
+ * Width suffix per lg_size.  lg_size 2 is deliberately empty: the intrinsic
+ * name is used without a suffix for that size.
+ */
+#define ATOMIC_INTERLOCKED_SUFFIX_0 8
+#define ATOMIC_INTERLOCKED_SUFFIX_1 16
+#define ATOMIC_INTERLOCKED_SUFFIX_2
+#define ATOMIC_INTERLOCKED_SUFFIX_3 64
+
+/*
+ * Generates atomic_<short_type>_t plus load, store, exchange, and
+ * compare-exchange for `type`, implemented with MSVC _Interlocked*
+ * intrinsics.  lg_size selects both the storage type
+ * (ATOMIC_INTERLOCKED_REPR) and the intrinsic width suffix
+ * (ATOMIC_INTERLOCKED_NAME); values are cast between `type` and the storage
+ * type on the way in and out.
+ *
+ * Loads and stores are plain accesses combined with atomic_fence() calls.
+ * The mo argument of exchange and of both compare-exchange variants is not
+ * consulted.  Compare-exchange passes `desired` as the intrinsic's exchange
+ * value and *expected as its comparand; on failure *expected is overwritten
+ * with the value found.  The "strong" variant forwards to the "weak" one.
+ */
+#define JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size)                   \
+	typedef struct {                                                       \
+		ATOMIC_INTERLOCKED_REPR(lg_size) repr;                         \
+	} atomic_##short_type##_t;                                             \
+                                                                               \
+	ATOMIC_INLINE type atomic_load_##short_type(                           \
+	    const atomic_##short_type##_t *a, atomic_memory_order_t mo) {      \
+		ATOMIC_INTERLOCKED_REPR(lg_size) ret = a->repr;                \
+		if (mo != atomic_memory_order_relaxed) {                       \
+			atomic_fence(atomic_memory_order_acquire);             \
+		}                                                              \
+		return (type)ret;                                              \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE void atomic_store_##short_type(                          \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		if (mo != atomic_memory_order_relaxed) {                       \
+			atomic_fence(atomic_memory_order_release);             \
+		}                                                              \
+		a->repr = (ATOMIC_INTERLOCKED_REPR(lg_size))val;               \
+		if (mo == atomic_memory_order_seq_cst) {                       \
+			atomic_fence(atomic_memory_order_seq_cst);             \
+		}                                                              \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE type atomic_exchange_##short_type(                       \
+	    atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) {  \
+		return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedExchange,     \
+		    lg_size)(&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE bool atomic_compare_exchange_weak_##short_type(          \
+	    atomic_##short_type##_t *a, type *expected, type desired,          \
+	    atomic_memory_order_t success_mo,                                  \
+	    atomic_memory_order_t failure_mo) {                                \
+		ATOMIC_INTERLOCKED_REPR(lg_size)                               \
+		e = (ATOMIC_INTERLOCKED_REPR(lg_size)) * expected;             \
+		ATOMIC_INTERLOCKED_REPR(lg_size)                               \
+		d = (ATOMIC_INTERLOCKED_REPR(lg_size))desired;                 \
+		ATOMIC_INTERLOCKED_REPR(lg_size)                               \
+		old = ATOMIC_INTERLOCKED_NAME(                                 \
+		    _InterlockedCompareExchange, lg_size)(&a->repr, d, e);     \
+		if (old == e) {                                                \
+			return true;                                           \
+		} else {                                                       \
+			*expected = (type)old;                                 \
+			return false;                                          \
+		}                                                              \
+	}                                                                      \
+                                                                               \
+	ATOMIC_INLINE bool atomic_compare_exchange_strong_##short_type(        \
+	    atomic_##short_type##_t *a, type *expected, type desired,          \
+	    atomic_memory_order_t success_mo,                                  \
+	    atomic_memory_order_t failure_mo) {                                \
+		/* We implement the weak version with strong semantics. */     \
+		return atomic_compare_exchange_weak_##short_type(              \
+		    a, expected, desired, success_mo, failure_mo);             \
+	}
+
+/* clang-format off */
+/*
+ * Extends JEMALLOC_GENERATE_ATOMICS with fetch-and-{add,sub,and,or,xor} for
+ * integer types, built on the sized _InterlockedExchangeAdd / _InterlockedAnd
+ * / _InterlockedOr / _InterlockedXor intrinsics.  fetch_sub is expressed as
+ * fetch_add of the negated operand.  mo is not consulted by the intrinsic
+ * calls; fetch_sub merely forwards it to fetch_add.
+ */
+#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size)	\
+JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size)			\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_add_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedExchangeAdd,	\
+	    lg_size)(&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);	\
+}									\
+									\
+ATOMIC_INLINE type							\
+atomic_fetch_sub_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	/*								\
+	 * MSVC warns on negation of unsigned operands, but for us it	\
+	 * gives exactly the right semantics (MAX_TYPE + 1 - operand).	\
+	 */								\
+	__pragma(warning(push))						\
+	__pragma(warning(disable: 4146))				\
+	return atomic_fetch_add_##short_type(a, -val, mo);		\
+	__pragma(warning(pop))						\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_and_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedAnd, lg_size)(	\
+	    &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);		\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_or_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedOr, lg_size)(	\
+	    &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);		\
+}									\
+ATOMIC_INLINE type							\
+atomic_fetch_xor_##short_type(atomic_##short_type##_t *a,		\
+    type val, atomic_memory_order_t mo) {				\
+	return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedXor, lg_size)(	\
+	    &a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val);		\
+}
+/* clang-format on */
+
+#undef ATOMIC_INLINE
+
+#endif /* JEMALLOC_INTERNAL_ATOMIC_MSVC_H */
--- a/include/jemalloc/internal/background_thread_externs.h
+++ b/include/jemalloc/internal/background_thread_externs.h
@ -0,0 +1,38 @@
+#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H
+#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/background_thread_structs.h"
+#include "jemalloc/internal/base.h"
+#include "jemalloc/internal/mutex.h"
+
+/*
+ * Global background-thread state.  Only the declarations are visible in this
+ * header; the definitions live elsewhere.
+ */
+extern bool                      opt_background_thread;
+extern size_t                    opt_max_background_threads;
+extern malloc_mutex_t            background_thread_lock;
+extern atomic_b_t                background_thread_enabled_state;
+extern size_t                    n_background_threads;
+extern size_t                    max_background_threads;
+extern background_thread_info_t *background_thread_info;
+
+/* Background-thread API (implemented outside this header). */
+bool background_thread_create(tsd_t *tsd, unsigned arena_ind);
+bool background_threads_enable(tsd_t *tsd);
+bool background_threads_disable(tsd_t *tsd);
+bool background_thread_is_started(background_thread_info_t *info);
+void background_thread_wakeup_early(
+    background_thread_info_t *info, nstime_t *remaining_sleep);
+void background_thread_prefork0(tsdn_t *tsdn);
+void background_thread_prefork1(tsdn_t *tsdn);
+void background_thread_postfork_parent(tsdn_t *tsdn);
+void background_thread_postfork_child(tsdn_t *tsdn);
+bool background_thread_stats_read(
+    tsdn_t *tsdn, background_thread_stats_t *stats);
+void background_thread_ctl_init(tsdn_t *tsdn);
+
+/*
+ * The wrapper is declared only when JEMALLOC_PTHREAD_CREATE_WRAPPER is
+ * defined; background_thread_structs.h defines it when either
+ * JEMALLOC_BACKGROUND_THREAD or JEMALLOC_LAZY_LOCK is set.
+ */
+#ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER
+extern int pthread_create_wrapper(pthread_t *__restrict, const pthread_attr_t *,
+    void *(*)(void *), void *__restrict);
+#endif
+bool background_thread_boot0(void);
+bool background_thread_boot1(tsdn_t *tsdn, base_t *base);
+
+#endif /* JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H */
--- a/include/jemalloc/internal/background_thread_inlines.h
+++ b/include/jemalloc/internal/background_thread_inlines.h
@ -0,0 +1,58 @@
+#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H
+#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/arena_inlines_a.h"
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/background_thread_externs.h"
+
+/*
+ * Reports whether background threads are currently enabled, via a relaxed
+ * load of the global background_thread_enabled_state flag.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+background_thread_enabled(void) {
+	bool enabled = atomic_load_b(
+	    &background_thread_enabled_state, ATOMIC_RELAXED);
+	return enabled;
+}
+
+/*
+ * Stores state into the global background_thread_enabled_state flag with
+ * relaxed ordering.  Unlike background_thread_enabled_set(), this performs no
+ * lock-ownership assertion.
+ */
+JEMALLOC_ALWAYS_INLINE void
+background_thread_enabled_set_impl(bool state) {
+	atomic_store_b(&background_thread_enabled_state, state, ATOMIC_RELAXED);
+}
+
+/*
+ * Sets the global enabled flag to state.  The caller must hold
+ * background_thread_lock; this is asserted before the store.
+ */
+JEMALLOC_ALWAYS_INLINE void
+background_thread_enabled_set(tsdn_t *tsdn, bool state) {
+	malloc_mutex_assert_owner(tsdn, &background_thread_lock);
+	background_thread_enabled_set_impl(state);
+}
+
+/*
+ * Returns the background_thread_info entry that serves arena: the arena's
+ * index taken modulo max_background_threads selects the slot, so several
+ * arenas can map to the same entry.
+ */
+JEMALLOC_ALWAYS_INLINE background_thread_info_t *
+arena_background_thread_info_get(arena_t *arena) {
+	return &background_thread_info[arena_ind_get(arena)
+	    % max_background_threads];
+}
+
+/*
+ * Returns the background_thread_info entry for index ind, wrapping ind
+ * modulo max_background_threads.
+ */
+JEMALLOC_ALWAYS_INLINE background_thread_info_t *
+background_thread_info_get(size_t ind) {
+	size_t slot = ind % max_background_threads;
+	return &background_thread_info[slot];
+}
+
+/*
+ * Returns info's next scheduled wakeup time in nanoseconds.  Asserts that the
+ * indefinite_sleep flag agrees with whether that time equals the
+ * BACKGROUND_THREAD_INDEFINITE_SLEEP sentinel.
+ */
+JEMALLOC_ALWAYS_INLINE uint64_t
+background_thread_wakeup_time_get(background_thread_info_t *info) {
+	uint64_t next_wakeup = nstime_ns(&info->next_wakeup);
+	assert(atomic_load_b(&info->indefinite_sleep, ATOMIC_ACQUIRE)
+	    == (next_wakeup == BACKGROUND_THREAD_INDEFINITE_SLEEP));
+	return next_wakeup;
+}
+
+/*
+ * Records wakeup_time as info's next wakeup.  The caller must hold info->mtx
+ * (asserted).  The indefinite_sleep flag is published first, with release
+ * ordering, and is true exactly when wakeup_time is the
+ * BACKGROUND_THREAD_INDEFINITE_SLEEP sentinel; next_wakeup is written after.
+ */
+JEMALLOC_ALWAYS_INLINE void
+background_thread_wakeup_time_set(
+    tsdn_t *tsdn, background_thread_info_t *info, uint64_t wakeup_time) {
+	malloc_mutex_assert_owner(tsdn, &info->mtx);
+	atomic_store_b(&info->indefinite_sleep,
+	    wakeup_time == BACKGROUND_THREAD_INDEFINITE_SLEEP, ATOMIC_RELEASE);
+	nstime_init(&info->next_wakeup, wakeup_time);
+}
+
+/*
+ * Returns info's indefinite_sleep flag (acquire load); true means no wakeup
+ * is scheduled.  No lock assertion is made here.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+background_thread_indefinite_sleep(background_thread_info_t *info) {
+	return atomic_load_b(&info->indefinite_sleep, ATOMIC_ACQUIRE);
+}
+
+#endif /* JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H */
--- a/include/jemalloc/internal/background_thread_structs.h
+++ b/include/jemalloc/internal/background_thread_structs.h
@ -0,0 +1,69 @@
+#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H
+#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/mutex.h"
+
+/* This file really combines "structs" and "types", but only transitionally. */
+
+/*
+ * Enable the pthread_create wrapper when either background threads or lazy
+ * locking is compiled in.
+ */
+#if defined(JEMALLOC_BACKGROUND_THREAD) || defined(JEMALLOC_LAZY_LOCK)
+#	define JEMALLOC_PTHREAD_CREATE_WRAPPER
+#endif
+
+/* Sentinel wakeup time meaning "no wakeup scheduled". */
+#define BACKGROUND_THREAD_INDEFINITE_SLEEP UINT64_MAX
+/* Background-thread limit, defined as the arena limit. */
+#define MAX_BACKGROUND_THREAD_LIMIT MALLOCX_ARENA_LIMIT
+/*
+ * NOTE(review): presumably the default for opt_max_background_threads --
+ * confirm against where the option is initialized.
+ */
+#define DEFAULT_NUM_BACKGROUND_THREAD 4
+
+/*
+ * These exist only as a transitional state.  Eventually, deferral should be
+ * part of the PAI, and each implementation can indicate wait times with more
+ * specificity.
+ */
+#define BACKGROUND_THREAD_HPA_INTERVAL_MAX_UNINITIALIZED (-2)
+#define BACKGROUND_THREAD_HPA_INTERVAL_MAX_DEFAULT_WHEN_ENABLED 5000
+
+#define BACKGROUND_THREAD_DEFERRED_MIN UINT64_C(0)
+#define BACKGROUND_THREAD_DEFERRED_MAX UINT64_MAX
+
+/* Lifecycle state of one background thread (background_thread_info_t.state). */
+typedef enum {
+	background_thread_stopped,
+	background_thread_started,
+	/* Thread waits on the global lock when paused (for arena_reset). */
+	background_thread_paused,
+} background_thread_state_t;
+
+/*
+ * Per-background-thread bookkeeping.  background_thread_wakeup_time_set()
+ * asserts that mtx is held while the wakeup fields are updated.
+ */
+struct background_thread_info_s {
+#ifdef JEMALLOC_BACKGROUND_THREAD
+	/* Background thread is pthread specific. */
+	pthread_t      thread;
+	pthread_cond_t cond;
+#endif
+	malloc_mutex_t            mtx;
+	background_thread_state_t state;
+	/* When true, it means no wakeup scheduled. */
+	atomic_b_t indefinite_sleep;
+	/* Next scheduled wakeup time (absolute time in ns). */
+	nstime_t next_wakeup;
+	/*
+	 * Since the last background thread run, newly added number of pages
+	 * that need to be purged by the next wakeup.  This is adjusted on
+	 * epoch advance, and is used to determine whether we should signal the
+	 * background thread to wake up earlier.
+	 */
+	size_t npages_to_purge_new;
+	/* Stats: total number of runs since started. */
+	uint64_t tot_n_runs;
+	/* Stats: total sleep time since started. */
+	nstime_t tot_sleep_time;
+};
+typedef struct background_thread_info_s background_thread_info_t;
+
+/*
+ * Aggregate background-thread statistics.
+ * NOTE(review): presumably filled in by background_thread_stats_read(), which
+ * takes a pointer to this type -- confirm against its implementation.
+ */
+struct background_thread_stats_s {
+	size_t            num_threads;
+	uint64_t          num_runs;
+	nstime_t          run_interval;
+	mutex_prof_data_t max_counter_per_bg_thd;
+};
+typedef struct background_thread_stats_s background_thread_stats_t;
+
+#endif /* JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H */
--- a/include/jemalloc/internal/base.h
+++ b/include/jemalloc/internal/base.h
@ -1,25 +1,125 @@
-/******************************************************************************/
-#ifdef JEMALLOC_H_TYPES
+#ifndef JEMALLOC_INTERNAL_BASE_H
+#define JEMALLOC_INTERNAL_BASE_H

-#endif /* JEMALLOC_H_TYPES */
-/******************************************************************************/
-#ifdef JEMALLOC_H_STRUCTS
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/edata.h"
+#include "jemalloc/internal/ehooks.h"
+#include "jemalloc/internal/mutex.h"

-#endif /* JEMALLOC_H_STRUCTS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_EXTERNS
+/*
+ * Alignment when THP is not enabled.  Set to a constant 2 MiB in case the
+ * HUGEPAGE value is unexpectedly high (which would cause VM over-reservation).
+ */
+#define BASE_BLOCK_MIN_ALIGN ((size_t)2 << 20)

-void	*base_alloc(tsdn_t *tsdn, size_t size);
-void	base_stats_get(tsdn_t *tsdn, size_t *allocated, size_t *resident,
-    size_t *mapped);
-bool	base_boot(void);
-void	base_prefork(tsdn_t *tsdn);
-void	base_postfork_parent(tsdn_t *tsdn);
-void	base_postfork_child(tsdn_t *tsdn);
+/* Transparent-huge-page policy for metadata (see opt_metadata_thp). */
+enum metadata_thp_mode_e {
+	metadata_thp_disabled = 0,
+	/*
+	 * Lazily enable hugepage for metadata. To avoid high RSS caused by THP
+	 * + low usage arena (i.e. THP becomes a significant percentage), the
+	 * "auto" option only starts using THP after a base allocator used up
+	 * the first THP region.  Starting from the second hugepage (in a single
+	 * arena), "auto" behaves the same as "always", i.e. madvise hugepage
+	 * right away.
+	 */
+	metadata_thp_auto = 1,
+	metadata_thp_always = 2,
+	/* One past the last valid mode; not itself a mode. */
+	metadata_thp_mode_limit = 3
+};
+typedef enum metadata_thp_mode_e metadata_thp_mode_t;

-#endif /* JEMALLOC_H_EXTERNS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_INLINES
+#define METADATA_THP_DEFAULT metadata_thp_disabled
+extern metadata_thp_mode_t opt_metadata_thp;
+extern const char *const   metadata_thp_mode_names[];

-#endif /* JEMALLOC_H_INLINES */
-/******************************************************************************/
+/*
+ * Embedded at the beginning of every block of base-managed virtual memory.
+ * Blocks form a singly linked list through `next`; base_t.blocks holds the
+ * chain of blocks belonging to a base.
+ */
+typedef struct base_block_s base_block_t;
+struct base_block_s {
+	/* Total size of block's virtual memory mapping. */
+	size_t size;
+
+	/* Next block in list of base's blocks. */
+	base_block_t *next;
+
+	/* Tracks unused trailing space. */
+	edata_t edata;
+};
+
+/*
+ * State of one base allocator: its extent hooks, the chain of blocks it
+ * owns, heaps of unused trailing space within those blocks, and statistics.
+ */
+typedef struct base_s base_t;
+struct base_s {
+	/*
+	 * User-configurable extent hook functions.
+	 */
+	ehooks_t ehooks;
+
+	/*
+	 * User-configurable extent hook functions for metadata allocations.
+	 */
+	ehooks_t ehooks_base;
+
+	/* Protects base_alloc() and base_stats_get() operations. */
+	malloc_mutex_t mtx;
+
+	/* Using THP when true (metadata_thp auto mode). */
+	bool auto_thp_switched;
+	/*
+	 * Most recent size class in the series of increasingly large base
+	 * extents.  Logarithmic spacing between subsequent allocations ensures
+	 * that the total number of distinct mappings remains small.
+	 */
+	pszind_t pind_last;
+
+	/* Serial number generation state. */
+	size_t extent_sn_next;
+
+	/* Chain of all blocks associated with base. */
+	base_block_t *blocks;
+
+	/* Heap of extents that track unused trailing space within blocks. */
+	edata_heap_t avail[SC_NSIZES];
+
+	/* Contains reusable base edata (used by tcache_stacks currently). */
+	edata_avail_t edata_avail;
+
+	/*
+	 * Stats, only maintained if config_stats.  These correspond one-to-one
+	 * to the output parameters of base_stats_get().
+	 */
+	size_t allocated;
+	size_t edata_allocated;
+	size_t rtree_allocated;
+	size_t resident;
+	size_t mapped;
+	/* Number of THP regions touched. */
+	size_t n_thp;
+};
+
+/*
+ * Returns the index stored in base's ehooks (via ehooks_ind_get()).
+ * NOTE(review): presumably the index of the arena that owns this base --
+ * confirm against base_new()'s `ind` argument.
+ */
+static inline unsigned
+base_ind_get(const base_t *base) {
+	return ehooks_ind_get(&base->ehooks);
+}
+
+/*
+ * Reports whether metadata THP is in use at all, i.e. whether
+ * opt_metadata_thp is any mode other than metadata_thp_disabled.
+ */
+static inline bool
+metadata_thp_enabled(void) {
+	if (opt_metadata_thp == metadata_thp_disabled) {
+		return false;
+	}
+	return true;
+}
+
+/*
+ * Base allocator API (implemented outside this header).
+ * NOTE(review): the b0_* functions take no base_t argument and presumably
+ * operate on the base returned by b0get() -- confirm.
+ */
+base_t *b0get(void);
+base_t *base_new(tsdn_t *tsdn, unsigned ind, const extent_hooks_t *extent_hooks,
+    bool metadata_use_hooks);
+void    base_delete(tsdn_t *tsdn, base_t *base);
+ehooks_t       *base_ehooks_get(base_t *base);
+ehooks_t       *base_ehooks_get_for_metadata(base_t *base);
+extent_hooks_t *base_extent_hooks_set(
+    base_t *base, extent_hooks_t *extent_hooks);
+void    *base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment);
+edata_t *base_alloc_edata(tsdn_t *tsdn, base_t *base);
+void    *base_alloc_rtree(tsdn_t *tsdn, base_t *base, size_t size);
+void    *b0_alloc_tcache_stack(tsdn_t *tsdn, size_t size);
+void     b0_dalloc_tcache_stack(tsdn_t *tsdn, void *tcache_stack);
+/* Output parameters mirror the stats fields of base_t. */
+void     base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated,
+        size_t *edata_allocated, size_t *rtree_allocated, size_t *resident,
+        size_t *mapped, size_t *n_thp);
+void     base_prefork(tsdn_t *tsdn, base_t *base);
+void     base_postfork_parent(tsdn_t *tsdn, base_t *base);
+void     base_postfork_child(tsdn_t *tsdn, base_t *base);
+bool     base_boot(tsdn_t *tsdn);
+
+#endif /* JEMALLOC_INTERNAL_BASE_H */
--- a/include/jemalloc/internal/bin.h
+++ b/include/jemalloc/internal/bin.h
@ -0,0 +1,121 @@
+#ifndef JEMALLOC_INTERNAL_BIN_H
+#define JEMALLOC_INTERNAL_BIN_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/bin_info.h"
+#include "jemalloc/internal/bin_stats.h"
+#include "jemalloc/internal/bin_types.h"
+#include "jemalloc/internal/edata.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/sc.h"
+
+/*
+ * A bin contains a set of extents that are currently being used for slab
+ * allocations.
+ */
+typedef struct bin_s bin_t;
+struct bin_s {
+	/* All operations on bin_t fields require lock ownership. */
+	malloc_mutex_t lock;
+
+	/*
+	 * Bin statistics.  These get touched every time the lock is acquired,
+	 * so put them close by in the hopes of getting some cache locality.
+	 */
+	bin_stats_t stats;
+
+	/*
+	 * Current slab being used to service allocations of this bin's size
+	 * class.  slabcur is independent of slabs_{nonfull,full}; whenever
+	 * slabcur is reassigned, the previous slab must be deallocated or
+	 * inserted into slabs_{nonfull,full}.
+	 */
+	edata_t *slabcur;
+
+	/*
+	 * Heap of non-full slabs.  This heap is used to assure that new
+	 * allocations come from the non-full slab that is oldest/lowest in
+	 * memory.
+	 */
+	edata_heap_t slabs_nonfull;
+
+	/* List used to track full slabs. */
+	edata_list_active_t slabs_full;
+};
+
+/* A set of sharded bins of the same size class. */
+typedef struct bins_s bins_t;
+struct bins_s {
+	/* Sharded bins.  Dynamically sized. */
+	bin_t *bin_shards;
+};
+
+void bin_shard_sizes_boot(unsigned bin_shard_sizes[SC_NBINS]);
+bool bin_update_shard_size(unsigned bin_shards[SC_NBINS], size_t start_size,
+    size_t end_size, size_t nshards);
+
+/* Initializes a bin to empty.  Returns true on error. */
+bool bin_init(bin_t *bin);
+
+/* Forking. */
+void bin_prefork(tsdn_t *tsdn, bin_t *bin);
+void bin_postfork_parent(tsdn_t *tsdn, bin_t *bin);
+void bin_postfork_child(tsdn_t *tsdn, bin_t *bin);
+
+/* Slab region allocation. */
+void *bin_slab_reg_alloc(edata_t *slab, const bin_info_t *bin_info);
+void  bin_slab_reg_alloc_batch(
+     edata_t *slab, const bin_info_t *bin_info, unsigned cnt, void **ptrs);
+
+/* Slab list management. */
+void     bin_slabs_nonfull_insert(bin_t *bin, edata_t *slab);
+void     bin_slabs_nonfull_remove(bin_t *bin, edata_t *slab);
+edata_t *bin_slabs_nonfull_tryget(bin_t *bin);
+void     bin_slabs_full_insert(bool is_auto, bin_t *bin, edata_t *slab);
+void     bin_slabs_full_remove(bool is_auto, bin_t *bin, edata_t *slab);
+
+/* Slab association / demotion. */
+void bin_dissociate_slab(bool is_auto, edata_t *slab, bin_t *bin);
+void bin_lower_slab(tsdn_t *tsdn, bool is_auto, edata_t *slab, bin_t *bin);
+
+/* Deallocation helpers (called under bin lock). */
+void bin_dalloc_slab_prepare(tsdn_t *tsdn, edata_t *slab, bin_t *bin);
+void bin_dalloc_locked_handle_newly_empty(
+    tsdn_t *tsdn, bool is_auto, edata_t *slab, bin_t *bin);
+void bin_dalloc_locked_handle_newly_nonempty(
+    tsdn_t *tsdn, bool is_auto, edata_t *slab, bin_t *bin);
+
+/* Slabcur refill and allocation. */
+void  bin_refill_slabcur_with_fresh_slab(tsdn_t *tsdn, bin_t *bin,
+    szind_t binind, edata_t *fresh_slab);
+void *bin_malloc_with_fresh_slab(tsdn_t *tsdn, bin_t *bin,
+    szind_t binind, edata_t *fresh_slab);
+bool  bin_refill_slabcur_no_fresh_slab(tsdn_t *tsdn, bool is_auto,
+    bin_t *bin);
+void *bin_malloc_no_fresh_slab(tsdn_t *tsdn, bool is_auto, bin_t *bin,
+    szind_t binind);
+
+/* Bin selection. */
+bin_t *bin_choose(tsdn_t *tsdn, arena_t *arena, szind_t binind,
+    unsigned *binshard_p);
+
+/* Stats. */
+/*
+ * Adds bin's counters, and its lock's profiling data, into *dst_bin_stats.
+ * bin->lock is held across the whole merge, as bin_t requires for any access
+ * to its fields.
+ */
+static inline void
+bin_stats_merge(tsdn_t *tsdn, bin_stats_data_t *dst_bin_stats, bin_t *bin) {
+	bin_stats_t       *dst = &dst_bin_stats->stats_data;
+	const bin_stats_t *src = &bin->stats;
+
+	malloc_mutex_lock(tsdn, &bin->lock);
+	malloc_mutex_prof_accum(tsdn, &dst_bin_stats->mutex_data, &bin->lock);
+
+	/* Region-level counters. */
+	dst->nmalloc += src->nmalloc;
+	dst->ndalloc += src->ndalloc;
+	dst->nrequests += src->nrequests;
+	dst->curregs += src->curregs;
+
+	/* Fill / flush counters. */
+	dst->nfills += src->nfills;
+	dst->nflushes += src->nflushes;
+
+	/* Slab-level counters. */
+	dst->nslabs += src->nslabs;
+	dst->reslabs += src->reslabs;
+	dst->curslabs += src->curslabs;
+	dst->nonfull_slabs += src->nonfull_slabs;
+
+	malloc_mutex_unlock(tsdn, &bin->lock);
+}
+
+#endif /* JEMALLOC_INTERNAL_BIN_H */
--- a/include/jemalloc/internal/bin_info.h
+++ b/include/jemalloc/internal/bin_info.h
@ -0,0 +1,51 @@
+#ifndef JEMALLOC_INTERNAL_BIN_INFO_H
+#define JEMALLOC_INTERNAL_BIN_INFO_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/bitmap.h"
+
+/*
+ * Read-only information associated with each element of arena_t's bins array
+ * is stored separately, partly to reduce memory usage (only one copy, rather
+ * than one per arena), but mainly to avoid false cacheline sharing.
+ *
+ * Each slab has the following layout:
+ *
+ *   /--------------------\
+ *   | region 0           |
+ *   |--------------------|
+ *   | region 1           |
+ *   |--------------------|
+ *   | ...                |
+ *   | ...                |
+ *   | ...                |
+ *   |--------------------|
+ *   | region nregs-1     |
+ *   \--------------------/
+ */
+typedef struct bin_info_s bin_info_t;
+struct bin_info_s {
+	/* Size of regions in a slab for this bin's size class. */
+	size_t reg_size;
+
+	/* Total size of a slab for this bin's size class. */
+	size_t slab_size;
+
+	/* Total number of regions in a slab for this bin's size class. */
+	uint32_t nregs;
+
+	/* Number of sharded bins in each arena for this size class. */
+	uint32_t n_shards;
+
+	/*
+	 * Metadata used to manipulate bitmaps for slabs associated with this
+	 * bin.
+	 */
+	bitmap_info_t bitmap_info;
+};
+
+extern bin_info_t bin_infos[SC_NBINS];
+
+void bin_info_boot(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS]);
+
+#endif /* JEMALLOC_INTERNAL_BIN_INFO_H */
--- a/include/jemalloc/internal/bin_inlines.h
+++ b/include/jemalloc/internal/bin_inlines.h
@ -0,0 +1,112 @@
+#ifndef JEMALLOC_INTERNAL_BIN_INLINES_H
+#define JEMALLOC_INTERNAL_BIN_INLINES_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/bin.h"
+#include "jemalloc/internal/bin_info.h"
+#include "jemalloc/internal/bitmap.h"
+#include "jemalloc/internal/div.h"
+#include "jemalloc/internal/edata.h"
+#include "jemalloc/internal/sc.h"
+
+/*
+ * The dalloc bin info contains just the information that the common paths need
+ * during tcache flushes.  By force-inlining these paths, and using local copies
+ * of data (so that the compiler knows it's constant), we avoid a whole bunch of
+ * redundant loads and stores by leaving this information in registers.
+ */
+typedef struct bin_dalloc_locked_info_s bin_dalloc_locked_info_t;
+struct bin_dalloc_locked_info_s {
+	/* Copy of arena_binind_div_info[binind]; used to compute regind. */
+	div_info_t div_info;
+	/* Copy of bin_infos[binind].nregs. */
+	uint32_t   nregs;
+	/*
+	 * Number of regions freed since bin_dalloc_locked_begin().  Only
+	 * counted when config_stats is enabled, and applied to the bin's
+	 * stats by bin_dalloc_locked_finish().
+	 */
+	uint64_t   ndalloc;
+};
+
+/*
+ * Find the region index of a pointer within a slab.
+ *
+ * ptr must be the base address of a region inside slab; the assertions below
+ * check both the range and the alignment to reg_size.  div_info is expected
+ * to describe division by bin_infos[binind].reg_size, so that the byte offset
+ * of ptr within the slab can be converted to an index without a
+ * variable-divisor division.
+ */
+JEMALLOC_ALWAYS_INLINE size_t
+bin_slab_regind_impl(
+    div_info_t *div_info, szind_t binind, edata_t *slab, const void *ptr) {
+	size_t diff, regind;
+
+	/* Freeing a pointer outside the slab can cause assertion failure. */
+	assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab));
+	assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab));
+	/* Freeing an interior pointer can cause assertion failure. */
+	assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab))
+	        % (uintptr_t)bin_infos[binind].reg_size
+	    == 0);
+
+	/* Byte offset of ptr from the start of the slab. */
+	diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab));
+
+	/* Avoid doing division with a variable divisor. */
+	regind = div_compute(div_info, diff);
+	assert(regind < bin_infos[binind].nregs);
+	return regind;
+}
+
+/*
+ * Convenience wrapper around bin_slab_regind_impl() that uses the div_info
+ * stored in info (filled in by bin_dalloc_locked_begin()).
+ */
+JEMALLOC_ALWAYS_INLINE size_t
+bin_slab_regind(bin_dalloc_locked_info_t *info, szind_t binind,
+    edata_t *slab, const void *ptr) {
+	size_t regind = bin_slab_regind_impl(
+	    &info->div_info, binind, slab, ptr);
+	return regind;
+}
+
+/*
+ * Start a batch of locked deallocations for size class binind: copy the
+ * per-size-class constants (div info and nregs) into *info and reset the
+ * running dalloc count to zero.
+ */
+JEMALLOC_ALWAYS_INLINE void
+bin_dalloc_locked_begin(
+    bin_dalloc_locked_info_t *info, szind_t binind) {
+	info->div_info = arena_binind_div_info[binind];
+	info->nregs = bin_infos[binind].nregs;
+	info->ndalloc = 0;
+}
+
+/*
+ * Does the deallocation work associated with freeing a single pointer (a
+ * "step") in between a bin_dalloc_locked_begin() and a
+ * bin_dalloc_locked_finish() call.
+ *
+ * Returns true if arena_slab_dalloc must be called on slab.  Doesn't do
+ * stats updates, which happen during finish (this lets running counts get left
+ * in a register).
+ */
+JEMALLOC_ALWAYS_INLINE bool
+bin_dalloc_locked_step(tsdn_t *tsdn, bool is_auto, bin_t *bin,
+    bin_dalloc_locked_info_t *info, szind_t binind, edata_t *slab,
+    void *ptr) {
+	const bin_info_t *bin_info = &bin_infos[binind];
+	size_t            regind = bin_slab_regind(info, binind, slab, ptr);
+	slab_data_t      *slab_data = edata_slab_data_get(slab);
+
+	/* At least one region (the one being freed) must be in use. */
+	assert(edata_nfree_get(slab) < bin_info->nregs);
+	/* Freeing an unallocated pointer can cause assertion failure. */
+	assert(bitmap_get(slab_data->bitmap, &bin_info->bitmap_info, regind));
+
+	bitmap_unset(slab_data->bitmap, &bin_info->bitmap_info, regind);
+	edata_nfree_inc(slab);
+
+	if (config_stats) {
+		info->ndalloc++;
+	}
+
+	unsigned nfree = edata_nfree_get(slab);
+	if (nfree == bin_info->nregs) {
+		/* Every region in the slab is now free. */
+		bin_dalloc_locked_handle_newly_empty(
+		    tsdn, is_auto, slab, bin);
+		return true;
+	} else if (nfree == 1 && slab != bin->slabcur) {
+		/*
+		 * The slab had no free regions before this step and is not
+		 * the bin's current slab.
+		 */
+		bin_dalloc_locked_handle_newly_nonempty(
+		    tsdn, is_auto, slab, bin);
+	}
+	return false;
+}
+
+/*
+ * End a batch of locked deallocations: apply the dalloc count accumulated in
+ * info to bin->stats (ndalloc goes up, curregs goes down).  Does nothing
+ * unless config_stats is enabled.  tsdn is not used here.
+ */
+JEMALLOC_ALWAYS_INLINE void
+bin_dalloc_locked_finish(tsdn_t *tsdn, bin_t *bin,
+    bin_dalloc_locked_info_t *info) {
+	if (config_stats) {
+		bin->stats.ndalloc += info->ndalloc;
+		assert(bin->stats.curregs >= (size_t)info->ndalloc);
+		bin->stats.curregs -= (size_t)info->ndalloc;
+	}
+}
+
+#endif /* JEMALLOC_INTERNAL_BIN_INLINES_H */
--- a/include/jemalloc/internal/bin_stats.h
+++ b/include/jemalloc/internal/bin_stats.h
@ -0,0 +1,58 @@
+#ifndef JEMALLOC_INTERNAL_BIN_STATS_H
+#define JEMALLOC_INTERNAL_BIN_STATS_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/mutex_prof.h"
+
+typedef struct bin_stats_s bin_stats_t;
+struct bin_stats_s {
+	/*
+	 * Total number of allocation/deallocation requests served directly by
+	 * the bin.  Note that tcache may allocate an object, then recycle it
+	 * many times, resulting in many increments to nrequests, but only one
+	 * each to nmalloc and ndalloc.
+	 */
+	uint64_t nmalloc;
+	uint64_t ndalloc;
+
+	/*
+	 * Number of allocation requests that correspond to the size of this
+	 * bin.  This includes requests served by tcache, though tcache only
+	 * periodically merges into this counter.
+	 */
+	uint64_t nrequests;
+
+	/*
+	 * Current number of regions of this size class, including regions
+	 * currently cached by tcache.
+	 */
+	size_t curregs;
+
+	/* Number of tcache fills from this bin. */
+	uint64_t nfills;
+
+	/* Number of tcache flushes to this bin. */
+	uint64_t nflushes;
+
+	/* Total number of slabs created for this bin's size class. */
+	uint64_t nslabs;
+
+	/*
+	 * Total number of slabs reused by extracting them from the slabs heap
+	 * for this bin's size class.
+	 */
+	uint64_t reslabs;
+
+	/* Current number of slabs in this bin. */
+	size_t curslabs;
+
+	/* Current size of nonfull slabs heap in this bin. */
+	size_t nonfull_slabs;
+};
+
+/* Bin counters bundled with mutex profiling data. */
+typedef struct bin_stats_data_s bin_stats_data_t;
+struct bin_stats_data_s {
+	/* The bin's counters (see bin_stats_t). */
+	bin_stats_t       stats_data;
+	/* Mutex profiling data; presumably for the bin's lock -- confirm. */
+	mutex_prof_data_t mutex_data;
+};
+#endif /* JEMALLOC_INTERNAL_BIN_STATS_H */
--- a/include/jemalloc/internal/bin_types.h
+++ b/include/jemalloc/internal/bin_types.h
@ -0,0 +1,21 @@
+#ifndef JEMALLOC_INTERNAL_BIN_TYPES_H
+#define JEMALLOC_INTERNAL_BIN_TYPES_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/sc.h"
+
+#define BIN_SHARDS_MAX (1 << EDATA_BITS_BINSHARD_WIDTH)
+#define N_BIN_SHARDS_DEFAULT 1
+
+/* Used in TSD static initializer only. Real init in arena_bind(). */
+#define TSD_BINSHARDS_ZERO_INITIALIZER                                         \
+	{                                                                      \
+		{ UINT8_MAX }                                                  \
+	}
+
+/*
+ * One uint8_t shard index per bin index (SC_NBINS entries), kept in TSD.
+ * NOTE(review): presumably the shard this thread uses for each bin, set up in
+ * arena_bind() -- confirm against the caller.
+ */
+typedef struct tsd_binshards_s tsd_binshards_t;
+struct tsd_binshards_s {
+	uint8_t binshard[SC_NBINS];
+};
+
+#endif /* JEMALLOC_INTERNAL_BIN_TYPES_H */
--- a/include/jemalloc/internal/bit_util.h
+++ b/include/jemalloc/internal/bit_util.h
@ -0,0 +1,431 @@
+#ifndef JEMALLOC_INTERNAL_BIT_UTIL_H
+#define JEMALLOC_INTERNAL_BIT_UTIL_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/assert.h"
+
+/* Sanity check. */
+#if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL)      \
+    || !defined(JEMALLOC_INTERNAL_FFS)
+#	error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure
+#endif
+
+/*
+ * Unlike the builtins and posix ffs functions, our ffs requires a non-zero
+ * input, and returns the position of the lowest bit set (as opposed to the
+ * posix versions, which return 1 larger than that position and use a return
+ * value of zero as a sentinel).  This tends to simplify logic in callers, and
+ * allows for consistency with the builtins we build fls on top of.
+ */
+static inline unsigned
+ffs_llu(unsigned long long x) {
+	util_assume(x != 0);
+	return JEMALLOC_INTERNAL_FFSLL(x) - 1;
+}
+
+static inline unsigned
+ffs_lu(unsigned long x) {
+	util_assume(x != 0);
+	return JEMALLOC_INTERNAL_FFSL(x) - 1;
+}
+
+static inline unsigned
+ffs_u(unsigned x) {
+	util_assume(x != 0);
+	return JEMALLOC_INTERNAL_FFS(x) - 1;
+}
+
+/* clang-format off */
+#define DO_FLS_SLOW(x, suffix) do {					\
+	util_assume(x != 0);						\
+	x |= (x >> 1);							\
+	x |= (x >> 2);							\
+	x |= (x >> 4);							\
+	x |= (x >> 8);							\
+	x |= (x >> 16);							\
+	if (sizeof(x) > 4) {						\
+		/*							\
+		 * If sizeof(x) is 4, then the expression "x >> 32"	\
+		 * will generate compiler warnings even if the code	\
+		 * never executes.  This circumvents the warning, and	\
+		 * gets compiled out in optimized builds.		\
+		 */							\
+		int constant_32 = sizeof(x) * 4;			\
+		x |= (x >> constant_32);				\
+	}								\
+	x++;								\
+	if (x == 0) {							\
+		return 8 * sizeof(x) - 1;				\
+	}								\
+	return ffs_##suffix(x) - 1;					\
+} while(0)
+/* clang-format on */
+
+static inline unsigned
+fls_llu_slow(unsigned long long x) {
+	DO_FLS_SLOW(x, llu);
+}
+
+static inline unsigned
+fls_lu_slow(unsigned long x) {
+	DO_FLS_SLOW(x, lu);
+}
+
+static inline unsigned
+fls_u_slow(unsigned x) {
+	DO_FLS_SLOW(x, u);
+}
+
+#undef DO_FLS_SLOW
+
+#ifdef JEMALLOC_HAVE_BUILTIN_CLZ
+static inline unsigned
+fls_llu(unsigned long long x) {
+	util_assume(x != 0);
+	/*
+	 * Note that the xor here is more naturally written as subtraction; the
+	 * last bit set is the number of bits in the type minus the number of
+	 * leading zero bits.  But GCC implements that as:
+	 *    bsr     edi, edi
+	 *    mov     eax, 31
+	 *    xor     edi, 31
+	 *    sub     eax, edi
+	 * If we write it as xor instead, then we get
+	 *    bsr     eax, edi
+	 * as desired.
+	 */
+	return (8 * sizeof(x) - 1) ^ __builtin_clzll(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+	util_assume(x != 0);
+	return (8 * sizeof(x) - 1) ^ __builtin_clzl(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+	util_assume(x != 0);
+	return (8 * sizeof(x) - 1) ^ __builtin_clz(x);
+}
+#elif defined(_MSC_VER)
+
+#	if LG_SIZEOF_PTR == 3
+#		define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
+#	else
+/*
+ * This never actually runs; we're just dodging a compiler error for the
+ * never-taken branch where sizeof(void *) == 8.
+ */
+#		define DO_BSR64(bit, x)                                       \
+			bit = 0;                                               \
+			unreachable()
+#	endif
+
+/* clang-format off */
+#define DO_FLS(x) do {							\
+	if (x == 0) {							\
+		return 8 * sizeof(x);					\
+	}								\
+	unsigned long bit;						\
+	if (sizeof(x) == 4) {						\
+		_BitScanReverse(&bit, (unsigned)x);			\
+		return (unsigned)bit;					\
+	}								\
+	if (sizeof(x) == 8 && sizeof(void *) == 8) {			\
+		DO_BSR64(bit, x);					\
+		return (unsigned)bit;					\
+	}								\
+	if (sizeof(x) == 8 && sizeof(void *) == 4) {			\
+		/* Dodge a compiler warning, as above. */		\
+		int constant_32 = sizeof(x) * 4;			\
+		if (_BitScanReverse(&bit,				\
+		    (unsigned)(x >> constant_32))) {			\
+			return 32 + (unsigned)bit;			\
+		} else {						\
+			_BitScanReverse(&bit, (unsigned)x);		\
+			return (unsigned)bit;				\
+		}							\
+	}								\
+	unreachable();							\
+} while (0)
+/* clang-format on */
+
+static inline unsigned
+fls_llu(unsigned long long x) {
+	DO_FLS(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+	DO_FLS(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+	DO_FLS(x);
+}
+
+#	undef DO_FLS
+#	undef DO_BSR64
+#else
+
+static inline unsigned
+fls_llu(unsigned long long x) {
+	return fls_llu_slow(x);
+}
+
+static inline unsigned
+fls_lu(unsigned long x) {
+	return fls_lu_slow(x);
+}
+
+static inline unsigned
+fls_u(unsigned x) {
+	return fls_u_slow(x);
+}
+#endif
+
+#if LG_SIZEOF_LONG_LONG > 3
+#	error "Haven't implemented popcount for 16-byte ints."
+#endif
+
+/* clang-format off */
+#define DO_POPCOUNT(x, type) do {					\
+	/*								\
+	 * Algorithm from an old AMD optimization reference manual.	\
+	 * We're putting a little bit more work than you might expect	\
+	 * into the no-intrinsic case, since we only support the	\
+	 * GCC intrinsics spelling of popcount (for now).  Detecting	\
+	 * whether or not the popcount builtin is actually useable in	\
+	 * MSVC is nontrivial.						\
+	 */								\
+									\
+	type bmul = (type)0x0101010101010101ULL;			\
+									\
+	/*								\
+	 * Replace each 2 bits with the sideways sum of the original	\
+	 * values.  0x5 = 0b0101.					\
+	 *								\
+	 * You might expect this to be:					\
+	 *   x = (x & 0x55...) + ((x >> 1) & 0x55...).			\
+	 * That costs an extra mask relative to this, though.		\
+	 */								\
+	x = x - ((x >> 1) & (0x55U * bmul));				\
+	/* Replace each 4 bits with their sideways sum. 0x3 = 0b0011. */\
+	x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U));		\
+	/*								\
+	 * Replace each 8 bits with their sideways sum.  Note that we	\
+	 * can't overflow within each 4-bit sum here, so we can skip	\
+	 * the initial mask.						\
+	 */								\
+	x = (x + (x >> 4)) & (bmul * 0x0FU);				\
+	/*								\
+	 * None of the partial sums in this multiplication (viewed in	\
+	 * base-256) can overflow into the next digit.  So the least	\
+	 * significant byte of the product will be the least		\
+	 * significant byte of the original value, the second least	\
+	 * significant byte will be the sum of the two least		\
+	 * significant bytes of the original value, and so on.		\
+	 * Importantly, the high byte will be the byte-wise sum of all	\
+	 * the bytes of the original value.				\
+	 */								\
+	x = x * bmul;							\
+	x >>= ((sizeof(x) - 1) * 8);					\
+	return (unsigned)x;						\
+} while(0)
+/* clang-format on */
+
+static inline unsigned
+popcount_u_slow(unsigned bitmap) {
+	DO_POPCOUNT(bitmap, unsigned);
+}
+
+static inline unsigned
+popcount_lu_slow(unsigned long bitmap) {
+	DO_POPCOUNT(bitmap, unsigned long);
+}
+
+static inline unsigned
+popcount_llu_slow(unsigned long long bitmap) {
+	DO_POPCOUNT(bitmap, unsigned long long);
+}
+
+#undef DO_POPCOUNT
+
+/*
+ * Number of set bits in an unsigned.  Uses JEMALLOC_INTERNAL_POPCOUNT when it
+ * is defined, and the portable popcount_u_slow() otherwise.
+ */
+static inline unsigned
+popcount_u(unsigned bitmap) {
+#ifdef JEMALLOC_INTERNAL_POPCOUNT
+	return JEMALLOC_INTERNAL_POPCOUNT(bitmap);
+#else
+	return popcount_u_slow(bitmap);
+#endif
+}
+
+/*
+ * Number of set bits in an unsigned long.  Uses JEMALLOC_INTERNAL_POPCOUNTL
+ * when it is defined, and the portable popcount_lu_slow() otherwise.
+ */
+static inline unsigned
+popcount_lu(unsigned long bitmap) {
+#ifdef JEMALLOC_INTERNAL_POPCOUNTL
+	return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
+#else
+	return popcount_lu_slow(bitmap);
+#endif
+}
+
+/*
+ * Number of set bits in an unsigned long long.  Uses
+ * JEMALLOC_INTERNAL_POPCOUNTLL when it is defined, and the portable
+ * popcount_llu_slow() otherwise.
+ */
+static inline unsigned
+popcount_llu(unsigned long long bitmap) {
+#ifdef JEMALLOC_INTERNAL_POPCOUNTLL
+	return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap);
+#else
+	return popcount_llu_slow(bitmap);
+#endif
+}
+
+/*
+ * Clears the lowest set bit in *bitmap, and returns the position of that
+ * bit.  *bitmap *must not* be 0.
+ */
+
+static inline size_t
+cfs_lu(unsigned long *bitmap) {
+	util_assume(*bitmap != 0);
+	size_t bit = ffs_lu(*bitmap);
+	/* The bit is known to be set, so xor turns it off. */
+	*bitmap ^= ZU(1) << bit;
+	return bit;
+}
+
+/*
+ * Position of the lowest set bit of a size_t; x must be non-zero.  Dispatches
+ * to the ffs variant whose operand width matches LG_SIZEOF_PTR.
+ */
+static inline unsigned
+ffs_zu(size_t x) {
+#if LG_SIZEOF_PTR == LG_SIZEOF_INT
+	return ffs_u(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
+	return ffs_lu(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
+	return ffs_llu(x);
+#else
+#	error No implementation for size_t ffs()
+#endif
+}
+
+/*
+ * Position of the highest set bit of a size_t.  Dispatches to the fls variant
+ * whose operand width matches LG_SIZEOF_PTR.
+ */
+static inline unsigned
+fls_zu(size_t x) {
+#if LG_SIZEOF_PTR == LG_SIZEOF_INT
+	return fls_u(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
+	return fls_lu(x);
+#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
+	return fls_llu(x);
+#else
+#	error No implementation for size_t fls()
+#endif
+}
+
+/*
+ * Position of the lowest set bit of a 64-bit value; x must be non-zero.  Uses
+ * whichever of long / long long is 8 bytes wide.
+ */
+static inline unsigned
+ffs_u64(uint64_t x) {
+#if LG_SIZEOF_LONG == 3
+	return ffs_lu(x);
+#elif LG_SIZEOF_LONG_LONG == 3
+	return ffs_llu(x);
+#else
+#	error No implementation for 64-bit ffs()
+#endif
+}
+
+/*
+ * Position of the highest set bit of a 64-bit value.  Uses whichever of
+ * long / long long is 8 bytes wide.
+ */
+static inline unsigned
+fls_u64(uint64_t x) {
+#if LG_SIZEOF_LONG == 3
+	return fls_lu(x);
+#elif LG_SIZEOF_LONG_LONG == 3
+	return fls_llu(x);
+#else
+#	error No implementation for 64-bit fls()
+#endif
+}
+
+/*
+ * Position of the lowest set bit of a 32-bit value; x must be non-zero.  Only
+ * implemented for platforms where int is 4 bytes wide.
+ */
+static inline unsigned
+ffs_u32(uint32_t x) {
+#if LG_SIZEOF_INT == 2
+	return ffs_u(x);
+#else
+#	error No implementation for 32-bit ffs()
+#endif
+}
+
+/*
+ * Position of the highest set bit of a 32-bit value.  Only implemented for
+ * platforms where int is 4 bytes wide.
+ */
+static inline unsigned
+fls_u32(uint32_t x) {
+#if LG_SIZEOF_INT == 2
+	return fls_u(x);
+#else
+#	error No implementation for 32-bit fls()
+#endif
+}
+
+/*
+ * Compute the smallest power of 2 that is >= x, for 64-bit x.  Inputs 0 and 1
+ * are returned unchanged (so 0 maps to 0, which is not a power of 2).
+ */
+static inline uint64_t
+pow2_ceil_u64(uint64_t x) {
+	if (unlikely(x <= 1)) {
+		return x;
+	}
+	/* Highest set bit of x - 1; the result is the next bit up. */
+	size_t msb_on_index = fls_u64(x - 1);
+	/*
+	 * Range-check; it's on the callers to ensure that the result of this
+	 * call won't overflow.
+	 */
+	assert(msb_on_index < 63);
+	return 1ULL << (msb_on_index + 1);
+}
+
+/*
+ * Compute the smallest power of 2 that is >= x, for 32-bit x.  Inputs 0 and 1
+ * are returned unchanged (so 0 maps to 0, which is not a power of 2).
+ */
+static inline uint32_t
+pow2_ceil_u32(uint32_t x) {
+	if (unlikely(x <= 1)) {
+		return x;
+	}
+	/* Highest set bit of x - 1; the result is the next bit up. */
+	size_t msb_on_index = fls_u32(x - 1);
+	/* As above. */
+	assert(msb_on_index < 31);
+	return 1U << (msb_on_index + 1);
+}
+
+/*
+ * Compute the smallest power of 2 that is >= x.  Delegates to the 64- or
+ * 32-bit variant depending on LG_SIZEOF_PTR; as with those, x == 0 yields 0.
+ */
+static inline size_t
+pow2_ceil_zu(size_t x) {
+#if (LG_SIZEOF_PTR == 3)
+	return pow2_ceil_u64(x);
+#else
+	return pow2_ceil_u32(x);
+#endif
+}
+
+/*
+ * floor(log2(x)), i.e. the position of the highest set bit of x.  x must be
+ * non-zero.
+ */
+static inline unsigned
+lg_floor(size_t x) {
+	util_assume(x != 0);
+#if (LG_SIZEOF_PTR == 3)
+	return fls_u64(x);
+#else
+	return fls_u32(x);
+#endif
+}
+
+/*
+ * ceil(log2(x)): lg_floor(x), plus one when x is not a power of 2.  x must be
+ * non-zero (see lg_floor()).
+ */
+static inline unsigned
+lg_ceil(size_t x) {
+	return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
+}
+
+/* A compile-time version of lg_floor and lg_ceil. */
+#define LG_FLOOR_1(x) 0
+#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
+#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2))
+#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4))
+#define LG_FLOOR_16(x)                                                         \
+	(x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8))
+#define LG_FLOOR_32(x)                                                         \
+	(x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16))
+#define LG_FLOOR_64(x)                                                         \
+	(x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32))
+#if LG_SIZEOF_PTR == 2
+#	define LG_FLOOR(x) LG_FLOOR_32((x))
+#else
+#	define LG_FLOOR(x) LG_FLOOR_64((x))
+#endif
+
+#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1))
+
+#endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */
--- a/include/jemalloc/internal/bitmap.h
+++ b/include/jemalloc/internal/bitmap.h
@ -1,19 +1,27 @@
-/******************************************************************************/
-#ifdef JEMALLOC_H_TYPES
+#ifndef JEMALLOC_INTERNAL_BITMAP_H
+#define JEMALLOC_INTERNAL_BITMAP_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/bit_util.h"
+#include "jemalloc/internal/sc.h"
+
+typedef unsigned long bitmap_t;
+#define LG_SIZEOF_BITMAP LG_SIZEOF_LONG

 /* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */
-#define	LG_BITMAP_MAXBITS	LG_RUN_MAXREGS
-#define	BITMAP_MAXBITS		(ZU(1) << LG_BITMAP_MAXBITS)
-
-typedef struct bitmap_level_s bitmap_level_t;
-typedef struct bitmap_info_s bitmap_info_t;
-typedef unsigned long bitmap_t;
-#define	LG_SIZEOF_BITMAP	LG_SIZEOF_LONG
+#if SC_LG_SLAB_MAXREGS > LG_CEIL(SC_NSIZES)
+/* Maximum bitmap bit count is determined by maximum regions per slab. */
+#	define LG_BITMAP_MAXBITS SC_LG_SLAB_MAXREGS
+#else
+/* Maximum bitmap bit count is determined by number of extent size classes. */
+#	define LG_BITMAP_MAXBITS LG_CEIL(SC_NSIZES)
+#endif
+#define BITMAP_MAXBITS (ZU(1) << LG_BITMAP_MAXBITS)

 /* Number of bits per group. */
-#define	LG_BITMAP_GROUP_NBITS		(LG_SIZEOF_BITMAP + 3)
-#define	BITMAP_GROUP_NBITS		(ZU(1) << LG_BITMAP_GROUP_NBITS)
-#define	BITMAP_GROUP_NBITS_MASK		(BITMAP_GROUP_NBITS-1)
+#define LG_BITMAP_GROUP_NBITS (LG_SIZEOF_BITMAP + 3)
+#define BITMAP_GROUP_NBITS (1U << LG_BITMAP_GROUP_NBITS)
+#define BITMAP_GROUP_NBITS_MASK (BITMAP_GROUP_NBITS - 1)

 /*
 * Do some analysis on how big the bitmap is before we use a tree.  For a brute
@ -21,81 +29,139 @@ typedef unsigned long bitmap_t;
 * use a tree instead.
 */
 #if LG_BITMAP_MAXBITS - LG_BITMAP_GROUP_NBITS > 3
-#  define USE_TREE
+#	define BITMAP_USE_TREE
 #endif

 /* Number of groups required to store a given number of bits. */
-#define	BITMAP_BITS2GROUPS(nbits)					\
-    ((nbits + BITMAP_GROUP_NBITS_MASK) >> LG_BITMAP_GROUP_NBITS)
+#define BITMAP_BITS2GROUPS(nbits)                                              \
+	(((nbits) + BITMAP_GROUP_NBITS_MASK) >> LG_BITMAP_GROUP_NBITS)

 /*
 * Number of groups required at a particular level for a given number of bits.
 */
-#define	BITMAP_GROUPS_L0(nbits)						\
-    BITMAP_BITS2GROUPS(nbits)
-#define	BITMAP_GROUPS_L1(nbits)						\
-    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(nbits))
-#define	BITMAP_GROUPS_L2(nbits)						\
-    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits))))
-#define	BITMAP_GROUPS_L3(nbits)						\
-    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(		\
-	BITMAP_BITS2GROUPS((nbits)))))
+#define BITMAP_GROUPS_L0(nbits) BITMAP_BITS2GROUPS(nbits)
+#define BITMAP_GROUPS_L1(nbits) BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(nbits))
+#define BITMAP_GROUPS_L2(nbits)                                                \
+	BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits))))
+#define BITMAP_GROUPS_L3(nbits)                                                \
+	BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(                                 \
+	    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits)))))
+#define BITMAP_GROUPS_L4(nbits)                                                \
+	BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(              \
+	    BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits))))))

 /*
 * Assuming the number of levels, number of groups required for a given number
 * of bits.
 */
-#define	BITMAP_GROUPS_1_LEVEL(nbits)					\
-    BITMAP_GROUPS_L0(nbits)
-#define	BITMAP_GROUPS_2_LEVEL(nbits)					\
-    (BITMAP_GROUPS_1_LEVEL(nbits) + BITMAP_GROUPS_L1(nbits))
-#define	BITMAP_GROUPS_3_LEVEL(nbits)					\
-    (BITMAP_GROUPS_2_LEVEL(nbits) + BITMAP_GROUPS_L2(nbits))
-#define	BITMAP_GROUPS_4_LEVEL(nbits)					\
-    (BITMAP_GROUPS_3_LEVEL(nbits) + BITMAP_GROUPS_L3(nbits))
+#define BITMAP_GROUPS_1_LEVEL(nbits) BITMAP_GROUPS_L0(nbits)
+#define BITMAP_GROUPS_2_LEVEL(nbits)                                           \
+	(BITMAP_GROUPS_1_LEVEL(nbits) + BITMAP_GROUPS_L1(nbits))
+#define BITMAP_GROUPS_3_LEVEL(nbits)                                           \
+	(BITMAP_GROUPS_2_LEVEL(nbits) + BITMAP_GROUPS_L2(nbits))
+#define BITMAP_GROUPS_4_LEVEL(nbits)                                           \
+	(BITMAP_GROUPS_3_LEVEL(nbits) + BITMAP_GROUPS_L3(nbits))
+#define BITMAP_GROUPS_5_LEVEL(nbits)                                           \
+	(BITMAP_GROUPS_4_LEVEL(nbits) + BITMAP_GROUPS_L4(nbits))

 /*
 * Maximum number of groups required to support LG_BITMAP_MAXBITS.
 */
-#ifdef USE_TREE
+#ifdef BITMAP_USE_TREE

-#if LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS
-#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_1_LEVEL(BITMAP_MAXBITS)
-#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 2
-#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_2_LEVEL(BITMAP_MAXBITS)
-#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 3
-#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_3_LEVEL(BITMAP_MAXBITS)
-#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 4
-#  define BITMAP_GROUPS_MAX	BITMAP_GROUPS_4_LEVEL(BITMAP_MAXBITS)
-#else
-#  error "Unsupported bitmap size"
-#endif
+#	if LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS
+#		define BITMAP_GROUPS(nbits) BITMAP_GROUPS_1_LEVEL(nbits)
+#		define BITMAP_GROUPS_MAX BITMAP_GROUPS_1_LEVEL(BITMAP_MAXBITS)
+#	elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 2
+#		define BITMAP_GROUPS(nbits) BITMAP_GROUPS_2_LEVEL(nbits)
+#		define BITMAP_GROUPS_MAX BITMAP_GROUPS_2_LEVEL(BITMAP_MAXBITS)
+#	elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 3
+#		define BITMAP_GROUPS(nbits) BITMAP_GROUPS_3_LEVEL(nbits)
+#		define BITMAP_GROUPS_MAX BITMAP_GROUPS_3_LEVEL(BITMAP_MAXBITS)
+#	elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 4
+#		define BITMAP_GROUPS(nbits) BITMAP_GROUPS_4_LEVEL(nbits)
+#		define BITMAP_GROUPS_MAX BITMAP_GROUPS_4_LEVEL(BITMAP_MAXBITS)
+#	elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 5
+#		define BITMAP_GROUPS(nbits) BITMAP_GROUPS_5_LEVEL(nbits)
+#		define BITMAP_GROUPS_MAX BITMAP_GROUPS_5_LEVEL(BITMAP_MAXBITS)
+#	else
+#		error "Unsupported bitmap size"
+#	endif

-/* Maximum number of levels possible. */
-#define	BITMAP_MAX_LEVELS						\
-    (LG_BITMAP_MAXBITS / LG_SIZEOF_BITMAP)				\
-    + !!(LG_BITMAP_MAXBITS % LG_SIZEOF_BITMAP)
+/*
+ * Maximum number of levels possible.  This could be statically computed based
+ * on LG_BITMAP_MAXBITS:
+ *
+ * #define BITMAP_MAX_LEVELS \
+ *     (LG_BITMAP_MAXBITS / LG_SIZEOF_BITMAP) \
+ *     + !!(LG_BITMAP_MAXBITS % LG_SIZEOF_BITMAP)
+ *
+ * However, that would not allow the generic BITMAP_INFO_INITIALIZER() macro, so
+ * instead hardcode BITMAP_MAX_LEVELS to the largest number supported by the
+ * various cascading macros.  The only additional cost this incurs is some
+ * unused trailing entries in bitmap_info_t structures; the bitmaps themselves
+ * are not impacted.
+ */
+#	define BITMAP_MAX_LEVELS 5

-#else /* USE_TREE */
+#	define BITMAP_INFO_INITIALIZER(nbits)                                 \
+		{                                                              \
+			/* nbits. */                                           \
+			nbits, /* nlevels. */                                  \
+			    (BITMAP_GROUPS_L0(nbits)                           \
+			        > BITMAP_GROUPS_L1(nbits))                     \
+			    + (BITMAP_GROUPS_L1(nbits)                         \
+			        > BITMAP_GROUPS_L2(nbits))                     \
+			    + (BITMAP_GROUPS_L2(nbits)                         \
+			        > BITMAP_GROUPS_L3(nbits))                     \
+			    + (BITMAP_GROUPS_L3(nbits)                         \
+			        > BITMAP_GROUPS_L4(nbits))                     \
+			    + 1, /* levels. */                                 \
+			{                                                      \
+				{0}, {BITMAP_GROUPS_L0(nbits)},                \
+				    {BITMAP_GROUPS_L1(nbits)                   \
+				        + BITMAP_GROUPS_L0(nbits)},            \
+				    {BITMAP_GROUPS_L2(nbits)                   \
+				        + BITMAP_GROUPS_L1(nbits)              \
+				        + BITMAP_GROUPS_L0(nbits)},            \
+				    {BITMAP_GROUPS_L3(nbits)                   \
+				        + BITMAP_GROUPS_L2(nbits)              \
+				        + BITMAP_GROUPS_L1(nbits)              \
+				        + BITMAP_GROUPS_L0(nbits)},            \
+				{                                              \
+					BITMAP_GROUPS_L4(nbits)                \
+					    + BITMAP_GROUPS_L3(nbits)          \
+					    + BITMAP_GROUPS_L2(nbits)          \
+					    + BITMAP_GROUPS_L1(nbits)          \
+					    + BITMAP_GROUPS_L0(nbits)          \
+				}                                              \
+			}                                                      \
+		}

-#define	BITMAP_GROUPS_MAX BITMAP_BITS2GROUPS(BITMAP_MAXBITS)
+#else /* BITMAP_USE_TREE */

-#endif /* USE_TREE */
+#	define BITMAP_GROUPS(nbits) BITMAP_BITS2GROUPS(nbits)
+#	define BITMAP_GROUPS_MAX BITMAP_BITS2GROUPS(BITMAP_MAXBITS)

-#endif /* JEMALLOC_H_TYPES */
-/******************************************************************************/
-#ifdef JEMALLOC_H_STRUCTS
+#	define BITMAP_INFO_INITIALIZER(nbits)                                 \
+		{                                                              \
+			/* nbits. */                                           \
+			nbits, /* ngroups. */                                  \
+			    BITMAP_BITS2GROUPS(nbits)                          \
+		}

-struct bitmap_level_s {
+#endif /* BITMAP_USE_TREE */
+
+typedef struct bitmap_level_s {
 	/* Offset of this level's groups within the array of groups. */
 	size_t group_offset;
-};
+} bitmap_level_t;

-struct bitmap_info_s {
+typedef struct bitmap_info_s {
 	/* Logical number of bits in bitmap (stored at bottom level). */
 	size_t nbits;

-#ifdef USE_TREE
+#ifdef BITMAP_USE_TREE
 	/* Number of levels necessary for nbits. */
 	unsigned nlevels;

@ -103,39 +169,21 @@ struct bitmap_info_s {
 	 * Only the first (nlevels+1) elements are used, and levels are ordered
 	 * bottom to top (e.g. the bottom level is stored in levels[0]).
 	 */
-	bitmap_level_t levels[BITMAP_MAX_LEVELS+1];
-#else /* USE_TREE */
+	bitmap_level_t levels[BITMAP_MAX_LEVELS + 1];
+#else  /* BITMAP_USE_TREE */
 	/* Number of groups necessary for nbits. */
 	size_t ngroups;
-#endif /* USE_TREE */
-};
+#endif /* BITMAP_USE_TREE */
+} bitmap_info_t;

-#endif /* JEMALLOC_H_STRUCTS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_EXTERNS
+void   bitmap_info_init(bitmap_info_t *binfo, size_t nbits);
+void   bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo, bool fill);
+size_t bitmap_size(const bitmap_info_t *binfo);

-void	bitmap_info_init(bitmap_info_t *binfo, size_t nbits);
-void	bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo);
-size_t	bitmap_size(const bitmap_info_t *binfo);
-
-#endif /* JEMALLOC_H_EXTERNS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_INLINES
-
-#ifndef JEMALLOC_ENABLE_INLINE
-bool	bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo);
-bool	bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit);
-void	bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit);
-size_t	bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo);
-void	bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit);
-#endif
-
-#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_BITMAP_C_))
-JEMALLOC_INLINE bool
-bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo)
-{
-#ifdef USE_TREE
-	size_t rgoff = binfo->levels[binfo->nlevels].group_offset - 1;
+static inline bool
+bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo) {
+#ifdef BITMAP_USE_TREE
+	size_t   rgoff = binfo->levels[binfo->nlevels].group_offset - 1;
 	bitmap_t rg = bitmap[rgoff];
 	/* The bitmap is full iff the root group is 0. */
 	return (rg == 0);
@ -143,31 +191,30 @@ bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo)
 	size_t i;

 	for (i = 0; i < binfo->ngroups; i++) {
-		if (bitmap[i] != 0)
-			return (false);
+		if (bitmap[i] != 0) {
+			return false;
+		}
 	}
-	return (true);
+	return true;
 #endif
 }

-JEMALLOC_INLINE bool
-bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
-{
-	size_t goff;
+static inline bool
+bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
+	size_t   goff;
 	bitmap_t g;

 	assert(bit < binfo->nbits);
 	goff = bit >> LG_BITMAP_GROUP_NBITS;
 	g = bitmap[goff];
-	return (!(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK))));
+	return !(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK)));
 }

-JEMALLOC_INLINE void
-bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
-{
-	size_t goff;
+static inline void
+bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
+	size_t    goff;
 	bitmap_t *gp;
-	bitmap_t g;
+	bitmap_t  g;

 	assert(bit < binfo->nbits);
 	assert(!bitmap_get(bitmap, binfo, bit));
@ -178,7 +225,7 @@ bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
 	g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK);
 	*gp = g;
 	assert(bitmap_get(bitmap, binfo, bit));
-#ifdef USE_TREE
+#ifdef BITMAP_USE_TREE
 	/* Propagate group state transitions up the tree. */
 	if (g == 0) {
 		unsigned i;
@ -190,51 +237,113 @@ bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
 			assert(g & (ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK)));
 			g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK);
 			*gp = g;
-			if (g != 0)
+			if (g != 0) {
 				break;
+			}
 		}
 	}
 #endif
 }

-/* sfu: set first unset. */
-JEMALLOC_INLINE size_t
-bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo)
-{
+/* ffu: find first unset bit >= min_bit. */
+static inline size_t
+bitmap_ffu(const bitmap_t *bitmap, const bitmap_info_t *binfo, size_t min_bit) {
+	assert(min_bit < binfo->nbits);
+
+#ifdef BITMAP_USE_TREE
+	size_t bit = 0;
+	for (unsigned level = binfo->nlevels; level--;) {
+		size_t   lg_bits_per_group = (LG_BITMAP_GROUP_NBITS
+                    * (level + 1));
+		bitmap_t group = bitmap[binfo->levels[level].group_offset
+		    + (bit >> lg_bits_per_group)];
+		unsigned group_nmask =
+		    (unsigned)(((min_bit > bit) ? (min_bit - bit) : 0)
+		        >> (lg_bits_per_group - LG_BITMAP_GROUP_NBITS));
+		assert(group_nmask <= BITMAP_GROUP_NBITS);
+		bitmap_t group_mask = ~((1LU << group_nmask) - 1);
+		bitmap_t group_masked = group & group_mask;
+		if (group_masked == 0LU) {
+			if (group == 0LU) {
+				return binfo->nbits;
+			}
+			/*
+			 * min_bit was preceded by one or more unset bits in
+			 * this group, but there are no other unset bits in this
+			 * group.  Try again starting at the first bit of the
+			 * next sibling.  This will recurse at most once per
+			 * non-root level.
+			 */
+			size_t sib_base = bit + (ZU(1) << lg_bits_per_group);
+			assert(sib_base > min_bit);
+			assert(sib_base > bit);
+			if (sib_base >= binfo->nbits) {
+				return binfo->nbits;
+			}
+			return bitmap_ffu(bitmap, binfo, sib_base);
+		}
+		bit += ((size_t)ffs_lu(group_masked))
+		    << (lg_bits_per_group - LG_BITMAP_GROUP_NBITS);
+	}
+	assert(bit >= min_bit);
+	assert(bit < binfo->nbits);
+	return bit;
+#else
+	size_t   i = min_bit >> LG_BITMAP_GROUP_NBITS;
+	bitmap_t g = bitmap[i]
+	    & ~((1LU << (min_bit & BITMAP_GROUP_NBITS_MASK)) - 1);
 	size_t bit;
+	while (1) {
+		if (g != 0) {
+			bit = ffs_lu(g);
+			return (i << LG_BITMAP_GROUP_NBITS) + bit;
+		}
+		i++;
+		if (i >= binfo->ngroups) {
+			break;
+		}
+		g = bitmap[i];
+	}
+	return binfo->nbits;
+#endif
+}
+
+/* sfu: set first unset. */
+static inline size_t
+bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo) {
+	size_t   bit;
 	bitmap_t g;
 	unsigned i;

 	assert(!bitmap_full(bitmap, binfo));

-#ifdef USE_TREE
+#ifdef BITMAP_USE_TREE
 	i = binfo->nlevels - 1;
 	g = bitmap[binfo->levels[i].group_offset];
-	bit = ffs_lu(g) - 1;
+	bit = ffs_lu(g);
 	while (i > 0) {
 		i--;
 		g = bitmap[binfo->levels[i].group_offset + bit];
-		bit = (bit << LG_BITMAP_GROUP_NBITS) + (ffs_lu(g) - 1);
+		bit = (bit << LG_BITMAP_GROUP_NBITS) + ffs_lu(g);
 	}
 #else
 	i = 0;
 	g = bitmap[0];
-	while ((bit = ffs_lu(g)) == 0) {
+	while (g == 0) {
 		i++;
 		g = bitmap[i];
 	}
-	bit = (i << LG_BITMAP_GROUP_NBITS) + (bit - 1);
+	bit = (i << LG_BITMAP_GROUP_NBITS) + ffs_lu(g);
 #endif
 	bitmap_set(bitmap, binfo, bit);
-	return (bit);
+	return bit;
 }

-JEMALLOC_INLINE void
-bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
-{
-	size_t goff;
-	bitmap_t *gp;
-	bitmap_t g;
+static inline void
+bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
+	size_t      goff;
+	bitmap_t   *gp;
+	bitmap_t    g;
 	UNUSED bool propagate;

 	assert(bit < binfo->nbits);
@ -247,7 +356,7 @@ bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
 	g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK);
 	*gp = g;
 	assert(!bitmap_get(bitmap, binfo, bit));
-#ifdef USE_TREE
+#ifdef BITMAP_USE_TREE
 	/* Propagate group state transitions up the tree. */
 	if (propagate) {
 		unsigned i;
@ -261,14 +370,12 @@ bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit)
 			    == 0);
 			g ^= ZU(1) << (bit & BITMAP_GROUP_NBITS_MASK);
 			*gp = g;
-			if (!propagate)
+			if (!propagate) {
 				break;
+			}
 		}
 	}
-#endif /* USE_TREE */
+#endif /* BITMAP_USE_TREE */
 }

-#endif
-
-#endif /* JEMALLOC_H_INLINES */
-/******************************************************************************/
+#endif /* JEMALLOC_INTERNAL_BITMAP_H */
--- a/include/jemalloc/internal/buf_writer.h
+++ b/include/jemalloc/internal/buf_writer.h
@ -0,0 +1,36 @@
+#ifndef JEMALLOC_INTERNAL_BUF_WRITER_H
+#define JEMALLOC_INTERNAL_BUF_WRITER_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/tsd_types.h"
+
+/*
+ * Note: when using the buffered writer, cbopaque is passed to write_cb only
+ * when the buffer is flushed.  It would make a difference if cbopaque points
+ * to something that's changing for each write_cb call, or something that
+ * affects write_cb in a way dependent on the content of the output string.
+ * However, the most typical usage case in practice is that cbopaque points to
+ * some "option like" content for the write_cb, so it doesn't matter.
+ */
+
+typedef struct {
+	write_cb_t *write_cb;
+	void       *cbopaque;
+	char       *buf;
+	size_t      buf_size;
+	size_t      buf_end;
+	bool        internal_buf;
+} buf_writer_t;
+
+bool       buf_writer_init(tsdn_t *tsdn, buf_writer_t *buf_writer,
+          write_cb_t *write_cb, void *cbopaque, char *buf, size_t buf_len);
+void       buf_writer_flush(buf_writer_t *buf_writer);
+write_cb_t buf_writer_cb;
+void       buf_writer_terminate(tsdn_t *tsdn, buf_writer_t *buf_writer);
+
+typedef ssize_t(read_cb_t)(void *read_cbopaque, void *buf, size_t limit);
+void buf_writer_pipe(
+    buf_writer_t *buf_writer, read_cb_t *read_cb, void *read_cbopaque);
+
+#endif /* JEMALLOC_INTERNAL_BUF_WRITER_H */
--- a/include/jemalloc/internal/cache_bin.h
+++ b/include/jemalloc/internal/cache_bin.h
@ -0,0 +1,777 @@
+#ifndef JEMALLOC_INTERNAL_CACHE_BIN_H
+#define JEMALLOC_INTERNAL_CACHE_BIN_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/jemalloc_internal_externs.h"
+#include "jemalloc/internal/ql.h"
+#include "jemalloc/internal/safety_check.h"
+#include "jemalloc/internal/sz.h"
+
+/*
+ * The cache_bins are the mechanism that the tcache and the arena use to
+ * communicate.  The tcache fills from and flushes to the arena by passing a
+ * cache_bin_t to fill/flush.  When the arena needs to pull stats from the
+ * tcaches associated with it, it does so by iterating over its
+ * cache_bin_array_descriptor_t objects and reading out per-bin stats it
+ * contains.  This makes it so that the arena need not know about the existence
+ * of the tcache at all.
+ */
+
+/*
+ * The size in bytes of each cache bin stack.  We also use this to indicate
+ * *counts* of individual objects.
+ */
+typedef uint16_t cache_bin_sz_t;
+
+#define JUNK_ADDR ((uintptr_t)0x7a7a7a7a7a7a7a7aULL)
+/*
+ * Leave a noticeable mark pattern on the cache bin stack boundaries, in case a
+ * bug starts leaking those.  Make it look like the junk pattern but be distinct
+ * from it.
+ */
+static const uintptr_t cache_bin_preceding_junk = JUNK_ADDR;
+/* Note: JUNK_ADDR vs. JUNK_ADDR + 1 -- this tells you which pointer leaked. */
+static const uintptr_t cache_bin_trailing_junk = JUNK_ADDR + 1;
+/*
+ * A pointer used to initialize a fake stack_head for disabled small bins
+ * so that the enabled/disabled assessment does not rely on ncached_max.
+ */
+extern const uintptr_t disabled_bin;
+
+/*
+ * The width of cache_bin_sz_t implies the following maximum number of items
+ * in any individual bin.  The cache bins track their bounds looking just at
+ * the low bits of a pointer, compared against a cache_bin_sz_t.  So that's
+ *   1 << (sizeof(cache_bin_sz_t) * 8)
+ * bytes spread across pointer-sized objects to get the maximum.
+ */
+#define CACHE_BIN_NCACHED_MAX                                                  \
+	(((size_t)1 << sizeof(cache_bin_sz_t) * 8) / sizeof(void *) - 1)
+
+/*
+ * This lives inside the cache_bin (for locality reasons), and is initialized
+ * alongside it, but is otherwise not modified by any cache bin operations.
+ * It's logically public and maintained by its callers.
+ */
+typedef struct cache_bin_stats_s cache_bin_stats_t;
+struct cache_bin_stats_s {
+	/*
+	 * Number of allocation requests that corresponded to the size of this
+	 * bin.
+	 */
+	uint64_t nrequests;
+};
+
+/*
+ * Read-only information associated with each element of tcache_t's tbins array
+ * is stored separately, mainly to reduce memory usage.
+ */
+typedef struct cache_bin_info_s cache_bin_info_t;
+struct cache_bin_info_s {
+	cache_bin_sz_t ncached_max;
+};
+
+/*
+ * Responsible for caching allocations associated with a single size.
+ *
+ * Several pointers are used to track the stack.  To save on metadata bytes,
+ * only the stack_head is a full sized pointer (which is dereferenced on the
+ * fastpath), while the others store only the low 16 bits -- this is correct
+ * because a single stack never takes more space than 2^16 bytes, and at the
+ * same time only equality checks are performed on the low bits.
+ *
+ * (low addr)                                                  (high addr)
+ * |------stashed------|------available------|------cached-----|
+ * ^                   ^                     ^                 ^
+ * low_bound(derived)  low_bits_full         stack_head        low_bits_empty
+ */
+typedef struct cache_bin_s cache_bin_t;
+struct cache_bin_s {
+	/*
+	 * The stack grows down.  Whenever the bin is nonempty, the head points
+	 * to an array entry containing a valid allocation.  When it is empty,
+	 * the head points to one element past the owned array.
+	 */
+	void **stack_head;
+	/*
+	 * stack_head and tstats are both modified frequently.  Let's keep them
+	 * close so that they have a higher chance of being on the same
+	 * cacheline, thus fewer write-backs.
+	 */
+	cache_bin_stats_t tstats;
+
+	/*
+	 * The low bits of the address of the first item in the stack that
+	 * hasn't been used since the last GC, to track the low water mark (min
+	 * # of cached items).
+	 *
+	 * Since the stack grows down, this is a higher address than
+	 * low_bits_full.
+	 */
+	cache_bin_sz_t low_bits_low_water;
+
+	/*
+	 * The low bits of the value that stack_head will take on when the array
+	 * is full (of cached & stashed items).  But remember that stack_head
+	 * always points to a valid item when the array is nonempty -- this is
+	 * in the array.
+	 *
+	 * Recall that since the stack grows down, this is the lowest available
+	 * address in the array for caching.  Only adjusted when stashing items.
+	 */
+	cache_bin_sz_t low_bits_full;
+
+	/*
+	 * The low bits of the value that stack_head will take on when the array
+	 * is empty.
+	 *
+	 * The stack grows down -- this is one past the highest address in the
+	 * array.  Immutable after initialization.
+	 */
+	cache_bin_sz_t low_bits_empty;
+
+	/*
+	 * Per-bin info; holds ncached_max, the maximum number of cached items
+	 * in the bin.
+	 */
+	cache_bin_info_t bin_info;
+};
+
+/*
+ * The cache_bins live inside the tcache, but the arena (by design) isn't
+ * supposed to know much about tcache internals.  To let the arena iterate over
+ * associated bins, we keep (with the tcache) a linked list of
+ * cache_bin_array_descriptor_ts that tell the arena how to find the bins.
+ */
+typedef struct cache_bin_array_descriptor_s cache_bin_array_descriptor_t;
+struct cache_bin_array_descriptor_s {
+	/*
+	 * The arena keeps a list of the cache bins associated with it, for
+	 * stats collection.
+	 */
+	ql_elm(cache_bin_array_descriptor_t) link;
+	/* Pointers to the tcache bins. */
+	cache_bin_t *bins;
+};
+
+/*
+ * Prepares a descriptor for use: initializes its link element with ql_elm_new
+ * and records the array of cache bins it describes.
+ */
+static inline void
+cache_bin_array_descriptor_init(
+    cache_bin_array_descriptor_t *descriptor, cache_bin_t *bins) {
+	ql_elm_new(descriptor, link);
+	descriptor->bins = bins;
+}
+
+/*
+ * Returns true iff use-after-free detection is configured and ptr has none of
+ * the bits in san_cache_bin_nonfast_mask set.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+cache_bin_nonfast_aligned(const void *ptr) {
+	if (config_uaf_detection) {
+		/*
+		 * Alignment currently decides which pointers get junked &
+		 * stashed on dealloc (for catching use-after-free).  In some
+		 * common cases a page-aligned check is needed already (sdalloc
+		 * w/ config_prof), so it comes more or less for free -- no
+		 * added instructions on free_fastpath.
+		 *
+		 * An alternative way of choosing which pointer to sample would
+		 * be another thread_event that picks one every N bytes.  That
+		 * also adds no cost on the fastpath, but it would tend to pick
+		 * large allocations, which is not the desired behavior.
+		 */
+		uintptr_t masked = (uintptr_t)ptr & san_cache_bin_nonfast_mask;
+		return masked == 0;
+	}
+	return false;
+}
+
+/*
+ * Returns the address of disabled_bin, the value that cache_bin_disabled
+ * compares a bin's stack_head against.
+ */
+static inline const void *
+cache_bin_disabled_bin_stack(void) {
+	return &disabled_bin;
+}
+
+/*
+ * If a cache bin was zero initialized (either because it lives in static or
+ * thread-local storage, or was memset to 0), this function indicates whether or
+ * not cache_bin_init was called on it.
+ */
+static inline bool
+cache_bin_still_zero_initialized(cache_bin_t *bin) {
+	return bin->stack_head == NULL;
+}
+
+/*
+ * Reports whether the bin is disabled, i.e. whether its stack_head is the
+ * shared disabled_bin position.
+ */
+static inline bool
+cache_bin_disabled(cache_bin_t *bin) {
+	if (bin->stack_head != cache_bin_disabled_bin_stack()) {
+		return false;
+	}
+	/* The fake stack slot of a disabled bin must hold the junk pattern. */
+	assert((uintptr_t)(*bin->stack_head) == JUNK_ADDR);
+	return true;
+}
+
+/* Gets ncached_max without asserting that the bin is enabled. */
+static inline cache_bin_sz_t
+cache_bin_ncached_max_get_unsafe(cache_bin_t *bin) {
+	return bin->bin_info.ncached_max;
+}
+
+/* Returns ncached_max: Upper limit on ncached. */
+static inline cache_bin_sz_t
+cache_bin_ncached_max_get(cache_bin_t *bin) {
+	assert(!cache_bin_disabled(bin));
+	return cache_bin_ncached_max_get_unsafe(bin);
+}
+
+/*
+ * Internal.
+ *
+ * Checks that the position whose low bits are 'earlier' does not come after
+ * the position whose low bits are 'later'.  The low bits are only allowed to
+ * compare the other way around when low_bits_full is greater than
+ * low_bits_empty (the bin's range wraps around in the low bits).
+ */
+static inline void
+cache_bin_assert_earlier(
+    cache_bin_t *bin, cache_bin_sz_t earlier, cache_bin_sz_t later) {
+	assert(earlier <= later || bin->low_bits_full > bin->low_bits_empty);
+}
+
+/*
+ * Internal.
+ *
+ * Distance from 'earlier' to 'later', computed on the low bits only.  The
+ * result is truncated to cache_bin_sz_t, which makes the subtraction come out
+ * right even across a wraparound.  'earlier' must belong to the position that
+ * is earlier in memory.
+ */
+static inline cache_bin_sz_t
+cache_bin_diff(cache_bin_t *bin, cache_bin_sz_t earlier, cache_bin_sz_t later) {
+	cache_bin_assert_earlier(bin, earlier, later);
+	cache_bin_sz_t distance = (cache_bin_sz_t)(later - earlier);
+	return distance;
+}
+
+/*
+ * Number of items currently cached in the bin, without checking ncached_max.
+ *
+ * Computed from the low bits only: the byte distance from stack_head to the
+ * empty position, divided by the size of a pointer.
+ */
+static inline cache_bin_sz_t
+cache_bin_ncached_get_internal(cache_bin_t *bin) {
+	cache_bin_sz_t diff = cache_bin_diff(bin,
+	    (cache_bin_sz_t)(uintptr_t)bin->stack_head, bin->low_bits_empty);
+	cache_bin_sz_t n = diff / sizeof(void *);
+	/*
+	 * We have undefined behavior here; if this function is called from the
+	 * arena stats updating code, then stack_head could change from the
+	 * first line to the next one.  Morally, these loads should be atomic,
+	 * but compilers won't currently generate comparisons with in-memory
+	 * operands against atomics, and these variables get accessed on the
+	 * fast paths.  This should still be "safe" in the sense of generating
+	 * the correct assembly for the foreseeable future, though.
+	 */
+	/* A nonempty bin must have a non-NULL item at its head. */
+	assert(n == 0 || *(bin->stack_head) != NULL);
+	return n;
+}
+
+/*
+ * Number of items currently cached in the bin, with checking ncached_max.  The
+ * caller must know that no concurrent modification of the cache_bin is
+ * possible.
+ */
+static inline cache_bin_sz_t
+cache_bin_ncached_get_local(cache_bin_t *bin) {
+	cache_bin_sz_t n = cache_bin_ncached_get_internal(bin);
+	assert(n <= cache_bin_ncached_max_get(bin));
+	return n;
+}
+
+/*
+ * Internal.
+ *
+ * Returns the position one past the end of the backing array, i.e. the value
+ * stack_head has when the bin is empty.  It is reconstructed from stack_head
+ * plus the low-bits distance to low_bits_empty.
+ *
+ * Do not call this when the bin may be modified concurrently:
+ * 'bin->stack_head' and 'bin->low_bits_full' are both subject to concurrent
+ * modifications.
+ */
+static inline void **
+cache_bin_empty_position_get(cache_bin_t *bin) {
+	void         **head = bin->stack_head;
+	cache_bin_sz_t nbytes = cache_bin_diff(
+	    bin, (cache_bin_sz_t)(uintptr_t)head, bin->low_bits_empty);
+	void **empty_position = (void **)((byte_t *)head + nbytes);
+
+	assert(empty_position >= bin->stack_head);
+
+	return empty_position;
+}
+
+/*
+ * Internal.
+ *
+ * Low bits of the lowest address in the bin's usable range (the low_bound in
+ * the cache_bin_t diagram above): ncached_max pointer-sized slots below the
+ * empty position.
+ *
+ * Only values that are not concurrently modified are read here, so this should
+ * be safe to use in a multithreaded environment.  Currently concurrent access
+ * happens only during arena statistics collection.
+ */
+static inline cache_bin_sz_t
+cache_bin_low_bits_low_bound_get(cache_bin_t *bin) {
+	size_t range_bytes = cache_bin_ncached_max_get(bin) * sizeof(void *);
+	return (cache_bin_sz_t)(bin->low_bits_empty - range_bytes);
+}
+
+/*
+ * Internal.
+ *
+ * Returns the lowest-addressed position of the backing array, which lies
+ * ncached_max slots below the empty position.
+ */
+static inline void **
+cache_bin_low_bound_get(cache_bin_t *bin) {
+	cache_bin_sz_t nslots = cache_bin_ncached_max_get(bin);
+	void         **empty_position = cache_bin_empty_position_get(bin);
+	void         **low_bound = empty_position - nslots;
+
+	assert(low_bound <= bin->stack_head);
+	return low_bound;
+}
+
+/*
+ * As the name implies.  This is important since it's not correct to try to
+ * batch fill a nonempty cache bin.
+ */
+static inline void
+cache_bin_assert_empty(cache_bin_t *bin) {
+	assert(cache_bin_ncached_get_local(bin) == 0);
+	assert(cache_bin_empty_position_get(bin) == bin->stack_head);
+}
+
+/*
+ * Low water as an item count, with none of the correctness checking done by
+ * the caller-usable version.  For use while invariants are temporarily broken
+ * (like ncached >= low_water during flush).
+ */
+static inline cache_bin_sz_t
+cache_bin_low_water_get_internal(cache_bin_t *bin) {
+	cache_bin_sz_t nbytes = cache_bin_diff(
+	    bin, bin->low_bits_low_water, bin->low_bits_empty);
+	return nbytes / sizeof(void *);
+}
+
+/* Returns the numeric value of low water in [0, ncached]. */
+static inline cache_bin_sz_t
+cache_bin_low_water_get(cache_bin_t *bin) {
+	cache_bin_sz_t low_water = cache_bin_low_water_get_internal(bin);
+	assert(low_water <= cache_bin_ncached_max_get(bin));
+	assert(low_water <= cache_bin_ncached_get_local(bin));
+
+	cache_bin_assert_earlier(bin,
+	    (cache_bin_sz_t)(uintptr_t)bin->stack_head,
+	    bin->low_bits_low_water);
+
+	return low_water;
+}
+
+/*
+ * Indicates that the current cache bin position should be the low water mark
+ * going forward.
+ */
+static inline void
+cache_bin_low_water_set(cache_bin_t *bin) {
+	assert(!cache_bin_disabled(bin));
+	bin->low_bits_low_water = (cache_bin_sz_t)(uintptr_t)bin->stack_head;
+}
+
+/*
+ * Moves the low water mark to the current head if the bin now holds fewer
+ * items than the recorded low water.
+ */
+static inline void
+cache_bin_low_water_adjust(cache_bin_t *bin) {
+	assert(!cache_bin_disabled(bin));
+	cache_bin_sz_t ncached = cache_bin_ncached_get_internal(bin);
+	cache_bin_sz_t low_water = cache_bin_low_water_get_internal(bin);
+	if (low_water > ncached) {
+		cache_bin_low_water_set(bin);
+	}
+}
+
+/*
+ * Pops one item off the bin's stack.
+ *
+ * bin: the cache bin to allocate from.
+ * success: set to true iff an item was popped; callers must check this rather
+ *     than the returned pointer.
+ * adjust_low_water: when false, fail once the head reaches the low-water
+ *     mark; when true, keep allocating until the bin is empty and move the
+ *     low-water mark along with the head.
+ *
+ * Returns the popped item, or NULL when *success is false.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+cache_bin_alloc_impl(cache_bin_t *bin, bool *success, bool adjust_low_water) {
+	/*
+	 * success (instead of ret) should be checked upon the return of this
+	 * function.  We avoid checking (ret == NULL) because there is never a
+	 * null stored on the avail stack (which is unknown to the compiler),
+	 * and eagerly checking ret would cause pipeline stall (waiting for the
+	 * cacheline).
+	 */
+
+	/*
+	 * This may read from the empty position; however the loaded value won't
+	 * be used.  It's safe because the stack has one more slot reserved.
+	 */
+	void          *ret = *bin->stack_head;
+	cache_bin_sz_t low_bits = (cache_bin_sz_t)(uintptr_t)bin->stack_head;
+	void         **new_head = bin->stack_head + 1;
+
+	/*
+	 * Note that the low water mark is at most empty; if we pass this check,
+	 * we know we're non-empty.
+	 */
+	if (likely(low_bits != bin->low_bits_low_water)) {
+		bin->stack_head = new_head;
+		*success = true;
+		return ret;
+	}
+	/* The head is at the low-water mark; fail unless we may move it. */
+	if (!adjust_low_water) {
+		*success = false;
+		return NULL;
+	}
+	/*
+	 * In the fast-path case where we call alloc_easy and then alloc, the
+	 * previous checking and computation is optimized away -- we didn't
+	 * actually commit any of our operations.
+	 */
+	if (likely(low_bits != bin->low_bits_empty)) {
+		bin->stack_head = new_head;
+		/* Keep the low-water mark in step with the new head. */
+		bin->low_bits_low_water = (cache_bin_sz_t)(uintptr_t)new_head;
+		*success = true;
+		return ret;
+	}
+	/* The bin is empty. */
+	*success = false;
+	return NULL;
+}
+
+/*
+ * Allocate an item out of the bin, failing if we're at the low-water mark.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+cache_bin_alloc_easy(cache_bin_t *bin, bool *success) {
+	/* We don't look at info if we're not adjusting low-water. */
+	return cache_bin_alloc_impl(bin, success, false);
+}
+
+/*
+ * Allocate an item out of the bin, even if we're currently at the low-water
+ * mark (and failing only if the bin is empty).
+ */
+JEMALLOC_ALWAYS_INLINE void *
+cache_bin_alloc(cache_bin_t *bin, bool *success) {
+	return cache_bin_alloc_impl(bin, success, true);
+}
+
+/*
+ * Pops up to num items from the bin into out, copying them in order starting
+ * at stack_head, then re-checks the low water mark.  Returns the number of
+ * items copied, which is the smaller of num and the number cached.
+ */
+JEMALLOC_ALWAYS_INLINE cache_bin_sz_t
+cache_bin_alloc_batch(cache_bin_t *bin, size_t num, void **out) {
+	cache_bin_sz_t ncached = cache_bin_ncached_get_internal(bin);
+	cache_bin_sz_t ntaken = (num < ncached) ? (cache_bin_sz_t)num : ncached;
+
+	memcpy(out, bin->stack_head, ntaken * sizeof(void *));
+	bin->stack_head += ntaken;
+	cache_bin_low_water_adjust(bin);
+
+	return ntaken;
+}
+
+/*
+ * Returns true when the low bits of stack_head equal low_bits_full, i.e. the
+ * head has reached the lowest address available for caching.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+cache_bin_full(cache_bin_t *bin) {
+	return (
+	    (cache_bin_sz_t)(uintptr_t)bin->stack_head == bin->low_bits_full);
+}
+
+/*
+ * Debug-only double-free check: looks for ptr among the cached items nearest
+ * the head, examining at most opt_debug_double_free_max_scan of them.  If ptr
+ * is found, safety_check_fail is invoked and true is returned; otherwise
+ * returns false.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+cache_bin_dalloc_safety_checks(cache_bin_t *bin, void *ptr) {
+	if (!config_debug || opt_debug_double_free_max_scan == 0) {
+		return false;
+	}
+
+	cache_bin_sz_t ncached = cache_bin_ncached_get_internal(bin);
+	unsigned       nscan;
+	if (opt_debug_double_free_max_scan < ncached) {
+		nscan = opt_debug_double_free_max_scan;
+	} else {
+		nscan = ncached;
+	}
+
+	void **slots = bin->stack_head;
+	for (unsigned i = 0; i < nscan; i++) {
+		if (slots[i] != ptr) {
+			continue;
+		}
+		safety_check_fail(
+		    "Invalid deallocation detected: double free of "
+		    "pointer %p\n",
+		    ptr);
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Free an object into the given bin.  Fails only if the bin is full.
+ *
+ * Returns true if ptr was pushed onto the stack, or if the safety checks
+ * flagged it as already cached (in which case it is not pushed again).
+ * Returns false if the bin is full.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+cache_bin_dalloc_easy(cache_bin_t *bin, void *ptr) {
+	if (unlikely(cache_bin_full(bin))) {
+		return false;
+	}
+
+	/* A detected double free is reported by the check; skip the push. */
+	if (unlikely(cache_bin_dalloc_safety_checks(bin, ptr))) {
+		return true;
+	}
+
+	/* The stack grows down: step the head to the next lower slot. */
+	bin->stack_head--;
+	*bin->stack_head = ptr;
+	cache_bin_assert_earlier(bin, bin->low_bits_full,
+	    (cache_bin_sz_t)(uintptr_t)bin->stack_head);
+
+	return true;
+}
+
+/*
+ * Stashes ptr in the slot at the bin's full position and then advances
+ * low_bits_full by one pointer.  Returns false (without storing anything) if
+ * the bin is full, true otherwise.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+cache_bin_stash(cache_bin_t *bin, void *ptr) {
+	if (cache_bin_full(bin)) {
+		return false;
+	}
+
+	uintptr_t      head_addr = (uintptr_t)bin->stack_head;
+	cache_bin_sz_t low_bits_head = (cache_bin_sz_t)head_addr;
+	/*
+	 * Byte distance from the full position up to the head; the diff helper
+	 * takes care of wraparound in the low bits.
+	 */
+	cache_bin_sz_t avail_bytes = cache_bin_diff(
+	    bin, bin->low_bits_full, low_bits_head);
+	/* The stashed item goes at the full position, in [full, head). */
+	void **full_position = (void **)((byte_t *)bin->stack_head
+	    - avail_bytes);
+	*full_position = ptr;
+
+	assert(!cache_bin_full(bin));
+	bin->low_bits_full += sizeof(void *);
+	cache_bin_assert_earlier(bin, bin->low_bits_full, low_bits_head);
+
+	return true;
+}
+
+/*
+ * Number of stashed pointers, i.e. the number of slots between the low bound
+ * of the bin's range and the full position.
+ */
+JEMALLOC_ALWAYS_INLINE cache_bin_sz_t
+cache_bin_nstashed_get_internal(cache_bin_t *bin) {
+	cache_bin_sz_t ncached_max = cache_bin_ncached_max_get(bin);
+	cache_bin_sz_t low_bits_base = cache_bin_low_bits_low_bound_get(bin);
+	cache_bin_sz_t stashed_bytes = cache_bin_diff(
+	    bin, low_bits_base, bin->low_bits_full);
+	cache_bin_sz_t nstashed = stashed_bytes / sizeof(void *);
+
+	assert(nstashed <= ncached_max);
+	if (!config_debug || nstashed == 0) {
+		return nstashed;
+	}
+
+	/* Everything below exists for assertions only. */
+	void **base = cache_bin_low_bound_get(bin);
+	assert((cache_bin_sz_t)(uintptr_t)base == low_bits_base);
+
+	/* Inspect the stashed slot with the highest address. */
+	void *top_stashed = *(base + nstashed - 1);
+	bool  aligned = cache_bin_nonfast_aligned(top_stashed);
+#ifdef JEMALLOC_JET
+	/* Allow arbitrary pointers to be stashed in tests. */
+	aligned = true;
+#endif
+	assert(top_stashed != NULL && aligned);
+
+	return nstashed;
+}
+
+JEMALLOC_ALWAYS_INLINE cache_bin_sz_t
+cache_bin_nstashed_get_local(cache_bin_t *bin) {
+	cache_bin_sz_t n = cache_bin_nstashed_get_internal(bin);
+	assert(n <= cache_bin_ncached_max_get(bin));
+	return n;
+}
+
+/*
+ * Obtain a racy view of the number of items currently in the cache bin, in the
+ * presence of possible concurrent modifications.
+ *
+ * Note that this is the only racy function in this header.  Any other functions
+ * are assumed to be non-racy.  The "racy" term here means accessed from another
+ * thread (that is not the owner of the specific cache bin).  This only happens
+ * when gathering stats (read-only).  The only change because of the racy
+ * condition is that assertions based on mutable fields are omitted.
+ *
+ * It's important to keep in mind that 'bin->stack_head' and
+ * 'bin->low_bits_full' can be modified concurrently and almost no assertions
+ * about their values can be made.
+ *
+ * This function should not call other utility functions because the racy
+ * condition may cause unexpected / undefined behaviors in unverified utility
+ * functions.  Currently, this function calls two utility functions
+ * cache_bin_ncached_max_get and cache_bin_low_bits_low_bound_get because
+ * they help access values that will not be concurrently modified.
+ */
+static inline void
+cache_bin_nitems_get_remote(
+    cache_bin_t *bin, cache_bin_sz_t *ncached, cache_bin_sz_t *nstashed) {
+	/* Racy version of cache_bin_ncached_get_internal. */
+	cache_bin_sz_t diff = bin->low_bits_empty
+	    - (cache_bin_sz_t)(uintptr_t)bin->stack_head;
+	cache_bin_sz_t n = diff / sizeof(void *);
+	*ncached = n;
+
+	/* Racy version of cache_bin_nstashed_get_internal. */
+	cache_bin_sz_t low_bits_low_bound = cache_bin_low_bits_low_bound_get(
+	    bin);
+	n = (bin->low_bits_full - low_bits_low_bound) / sizeof(void *);
+	*nstashed = n;
+	/*
+	 * Note that we cannot assert anything regarding ncached_max because
+	 * it can be configured on the fly and is thus racy.
+	 */
+}
+
+/*
+ * For small bins, used to calculate how many items to fill at a time.
+ * The final nfill is calculated by (ncached_max >> (base - offset)).
+ */
+typedef struct cache_bin_fill_ctl_s cache_bin_fill_ctl_t;
+struct cache_bin_fill_ctl_s {
+	uint8_t base;
+	uint8_t offset;
+};
+
+/*
+ * Limit how many items can be flushed in a batch (Which is the upper bound
+ * for the nflush parameter in tcache_bin_flush_impl()).
+ * This is to avoid stack overflow when we do batch edata look up, which
+ * reserves a nflush * sizeof(emap_batch_lookup_result_t) stack variable.
+ */
+#define CACHE_BIN_NFLUSH_BATCH_MAX                                             \
+	((VARIABLE_ARRAY_SIZE_MAX >> LG_SIZEOF_PTR) - 1)
+
+/*
+ * Filling and flushing are done in batch, on arrays of void *s.  For filling,
+ * the arrays go forward, and can be accessed with ordinary array arithmetic.
+ * For flushing, we work from the end backwards, and so need to use special
+ * accessors that invert the usual ordering.
+ *
+ * This is important for maintaining first-fit; the arena code fills with
+ * earliest objects first, and so those are the ones we should return first for
+ * cache_bin_alloc calls.  When flushing, we should flush the objects that we
+ * wish to return later; those at the end of the array.  This is better for the
+ * first-fit heuristic as well as for cache locality; the most recently freed
+ * objects are the ones most likely to still be in cache.
+ *
+ * This all sounds very hand-wavey and theoretical, but reverting the ordering
+ * on one or the other pathway leads to measurable slowdowns.
+ */
+
+typedef struct cache_bin_ptr_array_s cache_bin_ptr_array_t;
+struct cache_bin_ptr_array_s {
+	cache_bin_sz_t n;
+	void         **ptr;
+};
+
+/*
+ * Declare a cache_bin_ptr_array_t sufficient for nval items.
+ *
+ * In the current implementation, this could be just part of a
+ * cache_bin_ptr_array_init_... call, since we reuse the cache bin stack memory.
+ * Indirecting behind a macro, though, means experimenting with linked-list
+ * representations is easy (since they'll require an alloca in the calling
+ * frame).
+ */
+#define CACHE_BIN_PTR_ARRAY_DECLARE(name, nval)                                \
+	cache_bin_ptr_array_t name;                                            \
+	name.n = (nval)
+
+/*
+ * Start a fill.  The bin must be empty, and this must be followed by a
+ * finish_fill call before doing any alloc/dalloc operations on the bin.
+ */
+static inline void
+cache_bin_init_ptr_array_for_fill(
+    cache_bin_t *bin, cache_bin_ptr_array_t *arr, cache_bin_sz_t nfill) {
+	cache_bin_assert_empty(bin);
+	void **empty_position = cache_bin_empty_position_get(bin);
+	/* Hand out the nfill slots directly below the empty position. */
+	arr->ptr = empty_position - nfill;
+}
+
+/*
+ * While nfill in cache_bin_init_ptr_array_for_fill is the number we *intend* to
+ * fill, nfilled here is the number we actually filled (which may be less, in
+ * case of OOM).
+ */
+static inline void
+cache_bin_finish_fill(
+    cache_bin_t *bin, cache_bin_ptr_array_t *arr, cache_bin_sz_t nfilled) {
+	cache_bin_assert_empty(bin);
+	void **stack_end = cache_bin_empty_position_get(bin);
+	void **new_head = stack_end - nfilled;
+	if (arr->n > nfilled) {
+		/*
+		 * Short fill: the items sit at the start of the arr->n slots
+		 * below the empty position; slide them up so that they end
+		 * exactly at the empty position.
+		 */
+		void **filled_start = stack_end - arr->n;
+		memmove(new_head, filled_start, nfilled * sizeof(void *));
+	}
+	bin->stack_head = new_head;
+	if (config_stats) {
+		/* Reset the bin stats as it's merged during fill. */
+		bin->tstats.nrequests = 0;
+	}
+}
+
+/*
+ * Same deal, but with flush.  Unlike fill (which can fail), the user must flush
+ * everything we give them.
+ */
+static inline void
+cache_bin_init_ptr_array_for_flush(
+    cache_bin_t *bin, cache_bin_ptr_array_t *arr, cache_bin_sz_t nflush) {
+	arr->ptr = cache_bin_empty_position_get(bin) - nflush;
+	assert(cache_bin_ncached_get_local(bin) == 0 || *arr->ptr != NULL);
+}
+
+/*
+ * Finish a flush of nflushed items.  The remaining cached items are moved up
+ * by nflushed slots (so that they again end at the empty position), the head
+ * is advanced to match, and the low water mark is re-checked.
+ *
+ * nflushed must not exceed the number of items currently cached.  arr is not
+ * used by the current implementation.
+ */
+static inline void
+cache_bin_finish_flush(
+    cache_bin_t *bin, cache_bin_ptr_array_t *arr, cache_bin_sz_t nflushed) {
+	cache_bin_sz_t ncached = cache_bin_ncached_get_local(bin);
+	/*
+	 * rem is unsigned: flushing more than is cached would make the
+	 * difference wrap around and hand memmove an enormous length.
+	 */
+	assert(nflushed <= ncached);
+	unsigned rem = ncached - nflushed;
+	memmove(
+	    bin->stack_head + nflushed, bin->stack_head, rem * sizeof(void *));
+	bin->stack_head += nflushed;
+	cache_bin_low_water_adjust(bin);
+	/* Reset the bin stats as it's merged during flush. */
+	if (config_stats) {
+		bin->tstats.nrequests = 0;
+	}
+}
+
+/*
+ * Points arr at the stashed items, which start at the low bound of the bin's
+ * backing array.  nstashed must be nonzero and equal to the bin's current
+ * stashed count.  binind is not used in the body.
+ */
+static inline void
+cache_bin_init_ptr_array_for_stashed(cache_bin_t *bin, szind_t binind,
+    cache_bin_ptr_array_t *arr, cache_bin_sz_t nstashed) {
+	assert(nstashed > 0);
+	assert(cache_bin_nstashed_get_local(bin) == nstashed);
+
+	void **low_bound = cache_bin_low_bound_get(bin);
+	arr->ptr = low_bound;
+	assert(*arr->ptr != NULL);
+}
+
+/*
+ * Finish flushing the stashed items: moves the full position back down to the
+ * low bound of the backing array, so that the bin holds no stashed items.
+ */
+static inline void
+cache_bin_finish_flush_stashed(cache_bin_t *bin) {
+	void **low_bound = cache_bin_low_bound_get(bin);
+
+	/*
+	 * Reset the bin local full position.  Truncate with cache_bin_sz_t,
+	 * like every other low-bits computation, rather than a hard-coded
+	 * integer width.
+	 */
+	bin->low_bits_full = (cache_bin_sz_t)(uintptr_t)low_bound;
+	assert(cache_bin_nstashed_get_local(bin) == 0);
+	/* Reset the bin stats as it's merged during flush. */
+	if (config_stats) {
+		bin->tstats.nrequests = 0;
+	}
+}
+
+/*
+ * Initialize a cache_bin_info to represent up to the given number of items in
+ * the cache_bins it is associated with.
+ */
+void cache_bin_info_init(
+    cache_bin_info_t *bin_info, cache_bin_sz_t ncached_max);
+/*
+ * Given an array of initialized cache_bin_info_ts, determine how big an
+ * allocation is required to initialize a full set of cache_bin_ts.
+ */
+void cache_bin_info_compute_alloc(const cache_bin_info_t *infos, szind_t ninfos,
+    size_t *size, size_t *alignment);
+
+/*
+ * Actually initialize some cache bins.  Callers should allocate the backing
+ * memory indicated by a call to cache_bin_info_compute_alloc.  They should
+ * then preincrement, call init once for each bin and info, and then call
+ * cache_bin_postincrement.  *cur_offset will then refer to the position
+ * immediately past the end of the allocation.
+ */
+void cache_bin_preincrement(const cache_bin_info_t *infos, szind_t ninfos,
+    void *alloc, size_t *cur_offset);
+void cache_bin_postincrement(void *alloc, size_t *cur_offset);
+void cache_bin_init(cache_bin_t *bin, const cache_bin_info_t *info, void *alloc,
+    size_t *cur_offset);
+void cache_bin_init_disabled(cache_bin_t *bin, cache_bin_sz_t ncached_max);
+
+bool cache_bin_stack_use_thp(void);
+
+#endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */
--- a/include/jemalloc/internal/chunk.h
+++ b/include/jemalloc/internal/chunk.h
@ -1,96 +0,0 @@
-/******************************************************************************/
-#ifdef JEMALLOC_H_TYPES
-
-/*
- * Size and alignment of memory chunks that are allocated by the OS's virtual
- * memory system.
- */
-#define	LG_CHUNK_DEFAULT	21
-
-/* Return the chunk address for allocation address a. */
-#define	CHUNK_ADDR2BASE(a)						\
-	((void *)((uintptr_t)(a) & ~chunksize_mask))
-
-/* Return the chunk offset of address a. */
-#define	CHUNK_ADDR2OFFSET(a)						\
-	((size_t)((uintptr_t)(a) & chunksize_mask))
-
-/* Return the smallest chunk multiple that is >= s. */
-#define	CHUNK_CEILING(s)						\
-	(((s) + chunksize_mask) & ~chunksize_mask)
-
-#define	CHUNK_HOOKS_INITIALIZER {					\
-    NULL,								\
-    NULL,								\
-    NULL,								\
-    NULL,								\
-    NULL,								\
-    NULL,								\
-    NULL								\
-}
-
-#endif /* JEMALLOC_H_TYPES */
-/******************************************************************************/
-#ifdef JEMALLOC_H_STRUCTS
-
-#endif /* JEMALLOC_H_STRUCTS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_EXTERNS
-
-extern size_t		opt_lg_chunk;
-extern const char	*opt_dss;
-
-extern rtree_t		chunks_rtree;
-
-extern size_t		chunksize;
-extern size_t		chunksize_mask; /* (chunksize - 1). */
-extern size_t		chunk_npages;
-
-extern const chunk_hooks_t	chunk_hooks_default;
-
-chunk_hooks_t	chunk_hooks_get(tsdn_t *tsdn, arena_t *arena);
-chunk_hooks_t	chunk_hooks_set(tsdn_t *tsdn, arena_t *arena,
-    const chunk_hooks_t *chunk_hooks);
-
-bool	chunk_register(tsdn_t *tsdn, const void *chunk,
-    const extent_node_t *node);
-void	chunk_deregister(const void *chunk, const extent_node_t *node);
-void	*chunk_alloc_base(size_t size);
-void	*chunk_alloc_cache(tsdn_t *tsdn, arena_t *arena,
-    chunk_hooks_t *chunk_hooks, void *new_addr, size_t size, size_t alignment,
-    bool *zero, bool *commit, bool dalloc_node);
-void	*chunk_alloc_wrapper(tsdn_t *tsdn, arena_t *arena,
-    chunk_hooks_t *chunk_hooks, void *new_addr, size_t size, size_t alignment,
-    bool *zero, bool *commit);
-void	chunk_dalloc_cache(tsdn_t *tsdn, arena_t *arena,
-    chunk_hooks_t *chunk_hooks, void *chunk, size_t size, bool committed);
-void	chunk_dalloc_wrapper(tsdn_t *tsdn, arena_t *arena,
-    chunk_hooks_t *chunk_hooks, void *chunk, size_t size, bool zeroed,
-    bool committed);
-bool	chunk_purge_wrapper(tsdn_t *tsdn, arena_t *arena,
-    chunk_hooks_t *chunk_hooks, void *chunk, size_t size, size_t offset,
-    size_t length);
-bool	chunk_boot(void);
-
-#endif /* JEMALLOC_H_EXTERNS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_INLINES
-
-#ifndef JEMALLOC_ENABLE_INLINE
-extent_node_t	*chunk_lookup(const void *chunk, bool dependent);
-#endif
-
-#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_CHUNK_C_))
-JEMALLOC_INLINE extent_node_t *
-chunk_lookup(const void *ptr, bool dependent)
-{
-
-	return (rtree_get(&chunks_rtree, (uintptr_t)ptr, dependent));
-}
-#endif
-
-#endif /* JEMALLOC_H_INLINES */
-/******************************************************************************/
-
-#include "jemalloc/internal/chunk_dss.h"
-#include "jemalloc/internal/chunk_mmap.h"
--- a/include/jemalloc/internal/chunk_dss.h
+++ b/include/jemalloc/internal/chunk_dss.h
@ -1,37 +0,0 @@
-/******************************************************************************/
-#ifdef JEMALLOC_H_TYPES
-
-typedef enum {
-	dss_prec_disabled  = 0,
-	dss_prec_primary   = 1,
-	dss_prec_secondary = 2,
-
-	dss_prec_limit     = 3
-} dss_prec_t;
-#define	DSS_PREC_DEFAULT	dss_prec_secondary
-#define	DSS_DEFAULT		"secondary"
-
-#endif /* JEMALLOC_H_TYPES */
-/******************************************************************************/
-#ifdef JEMALLOC_H_STRUCTS
-
-extern const char *dss_prec_names[];
-
-#endif /* JEMALLOC_H_STRUCTS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_EXTERNS
-
-dss_prec_t	chunk_dss_prec_get(void);
-bool	chunk_dss_prec_set(dss_prec_t dss_prec);
-void	*chunk_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr,
-    size_t size, size_t alignment, bool *zero, bool *commit);
-bool	chunk_in_dss(void *chunk);
-bool	chunk_dss_mergeable(void *chunk_a, void *chunk_b);
-void	chunk_dss_boot(void);
-
-#endif /* JEMALLOC_H_EXTERNS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_INLINES
-
-#endif /* JEMALLOC_H_INLINES */
-/******************************************************************************/
--- a/include/jemalloc/internal/chunk_mmap.h
+++ b/include/jemalloc/internal/chunk_mmap.h
@ -1,21 +0,0 @@
-/******************************************************************************/
-#ifdef JEMALLOC_H_TYPES
-
-#endif /* JEMALLOC_H_TYPES */
-/******************************************************************************/
-#ifdef JEMALLOC_H_STRUCTS
-
-#endif /* JEMALLOC_H_STRUCTS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_EXTERNS
-
-void	*chunk_alloc_mmap(void *new_addr, size_t size, size_t alignment,
-    bool *zero, bool *commit);
-bool	chunk_dalloc_mmap(void *chunk, size_t size);
-
-#endif /* JEMALLOC_H_EXTERNS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_INLINES
-
-#endif /* JEMALLOC_H_INLINES */
-/******************************************************************************/
--- a/include/jemalloc/internal/ckh.h
+++ b/include/jemalloc/internal/ckh.h
@ -1,86 +1,102 @@
+#ifndef JEMALLOC_INTERNAL_CKH_H
+#define JEMALLOC_INTERNAL_CKH_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/tsd.h"
+
+/* Cuckoo hashing implementation.  Skip to the end for the interface. */
+
+/******************************************************************************/
+/* INTERNAL DEFINITIONS -- IGNORE */
 /******************************************************************************/
-#ifdef JEMALLOC_H_TYPES
-
-typedef struct ckh_s ckh_t;
-typedef struct ckhc_s ckhc_t;
-
-/* Typedefs to allow easy function pointer passing. */
-typedef void ckh_hash_t (const void *, size_t[2]);
-typedef bool ckh_keycomp_t (const void *, const void *);

 /* Maintain counters used to get an idea of performance. */
-/* #define	CKH_COUNT */
+/* #define CKH_COUNT */
 /* Print counter values in ckh_delete() (requires CKH_COUNT). */
-/* #define	CKH_VERBOSE */
+/* #define CKH_VERBOSE */

 /*
 * There are 2^LG_CKH_BUCKET_CELLS cells in each hash table bucket.  Try to fit
 * one bucket per L1 cache line.
 */
-#define	LG_CKH_BUCKET_CELLS (LG_CACHELINE - LG_SIZEOF_PTR - 1)
+#define LG_CKH_BUCKET_CELLS (LG_CACHELINE - LG_SIZEOF_PTR - 1)

-#endif /* JEMALLOC_H_TYPES */
-/******************************************************************************/
-#ifdef JEMALLOC_H_STRUCTS
+/* Typedefs to allow easy function pointer passing. */
+typedef void ckh_hash_t(const void *, size_t[2]);
+typedef bool ckh_keycomp_t(const void *, const void *);

 /* Hash table cell. */
-struct ckhc_s {
-	const void	*key;
-	const void	*data;
-};
+typedef struct {
+	const void *key;
+	const void *data;
+} ckhc_t;

-struct ckh_s {
+/* The hash table itself. */
+typedef struct {
 #ifdef CKH_COUNT
 	/* Counters used to get an idea of performance. */
-	uint64_t	ngrows;
-	uint64_t	nshrinks;
-	uint64_t	nshrinkfails;
-	uint64_t	ninserts;
-	uint64_t	nrelocs;
+	uint64_t ngrows;
+	uint64_t nshrinks;
+	uint64_t nshrinkfails;
+	uint64_t ninserts;
+	uint64_t nrelocs;
 #endif

 	/* Used for pseudo-random number generation. */
-	uint64_t	prng_state;
+	uint64_t prng_state;

 	/* Total number of items. */
-	size_t		count;
+	size_t count;

 	/*
 	 * Minimum and current number of hash table buckets.  There are
 	 * 2^LG_CKH_BUCKET_CELLS cells per bucket.
 	 */
-	unsigned	lg_minbuckets;
-	unsigned	lg_curbuckets;
+	unsigned lg_minbuckets;
+	unsigned lg_curbuckets;

 	/* Hash and comparison functions. */
-	ckh_hash_t	*hash;
-	ckh_keycomp_t	*keycomp;
+	ckh_hash_t    *hash;
+	ckh_keycomp_t *keycomp;

 	/* Hash table with 2^lg_curbuckets buckets. */
-	ckhc_t		*tab;
-};
+	ckhc_t *tab;
+} ckh_t;

-#endif /* JEMALLOC_H_STRUCTS */
 /******************************************************************************/
-#ifdef JEMALLOC_H_EXTERNS
+/* BEGIN PUBLIC API */
+/******************************************************************************/

-bool	ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash,
+/* Lifetime management.  Minitems is the initial capacity. */
+bool ckh_new(tsd_t *tsd, ckh_t *ckh, size_t minitems, ckh_hash_t *hash,
    ckh_keycomp_t *keycomp);
-void	ckh_delete(tsd_t *tsd, ckh_t *ckh);
-size_t	ckh_count(ckh_t *ckh);
-bool	ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data);
-bool	ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data);
-bool	ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key,
-    void **data);
-bool	ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data);
-void	ckh_string_hash(const void *key, size_t r_hash[2]);
-bool	ckh_string_keycomp(const void *k1, const void *k2);
-void	ckh_pointer_hash(const void *key, size_t r_hash[2]);
-bool	ckh_pointer_keycomp(const void *k1, const void *k2);
+void ckh_delete(tsd_t *tsd, ckh_t *ckh);

-#endif /* JEMALLOC_H_EXTERNS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_INLINES
+/* Get the number of elements in the set. */
+size_t ckh_count(ckh_t *ckh);

-#endif /* JEMALLOC_H_INLINES */
-/******************************************************************************/
+/*
+ * To iterate over the elements in the table, initialize *tabind to 0 and call
+ * this function until it returns true.  Each call that returns false will
+ * update *key and *data to the next element in the table, assuming the pointers
+ * are non-NULL.
+ */
+bool ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data);
+
+/*
+ * Basic hash table operations -- insert, removal, lookup.  For ckh_remove and
+ * ckh_search, the key and data output pointers may each be NULL.  The hash
+ * table only stores pointers to the key and data, and doesn't do any lifetime
+ * management.
+ */
+bool ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data);
+bool ckh_remove(
+    tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key, void **data);
+bool ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data);
+
+/* Some useful hash and comparison functions for strings and pointers. */
+void ckh_string_hash(const void *key, size_t r_hash[2]);
+bool ckh_string_keycomp(const void *k1, const void *k2);
+void ckh_pointer_hash(const void *key, size_t r_hash[2]);
+bool ckh_pointer_keycomp(const void *k1, const void *k2);
+
+#endif /* JEMALLOC_INTERNAL_CKH_H */
--- a/include/jemalloc/internal/conf.h
+++ b/include/jemalloc/internal/conf.h
@ -0,0 +1,23 @@
+#ifndef JEMALLOC_INTERNAL_CONF_H
+#define JEMALLOC_INTERNAL_CONF_H
+
+#include "jemalloc/internal/sc.h"
+
+void malloc_conf_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
+    char readlink_buf[PATH_MAX + 1]);
+void malloc_abort_invalid_conf(void);
+
+#ifdef JEMALLOC_JET
+extern bool had_conf_error;
+
+bool conf_next(char const **opts_p, char const **k_p, size_t *klen_p,
+    char const **v_p, size_t *vlen_p);
+void conf_error(
+    const char *msg, const char *k, size_t klen, const char *v, size_t vlen);
+bool conf_handle_bool(const char *v, size_t vlen, bool *result);
+bool conf_handle_signed(const char *v, size_t vlen, intmax_t min, intmax_t max,
+    bool check_min, bool check_max, bool clip, intmax_t *result);
+bool conf_handle_char_p(const char *v, size_t vlen, char *dest, size_t dest_sz);
+#endif
+
+#endif /* JEMALLOC_INTERNAL_CONF_H */
--- a/include/jemalloc/internal/counter.h
+++ b/include/jemalloc/internal/counter.h
@ -0,0 +1,36 @@
+#ifndef JEMALLOC_INTERNAL_COUNTER_H
+#define JEMALLOC_INTERNAL_COUNTER_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/lockedint.h"
+#include "jemalloc/internal/mutex.h"
+
+/*
+ * Accumulator state used by counter_accum(): a running byte count and the
+ * interval it is accumulated against.
+ */
+typedef struct counter_accum_s {
+	/* Taken by counter_accum() around every update of accumbytes. */
+	LOCKEDINT_MTX_DECLARE(mtx)
+	/* Running total; advanced via locked_inc_mod_u64() with interval. */
+	locked_u64_t accumbytes;
+	/* Accumulation interval; counter_accum() asserts it is nonzero. */
+	uint64_t     interval;
+} counter_accum_t;
+
+/*
+ * Add bytes to the counter's accumulator while holding the counter's lock, and
+ * return the overflow indication produced by locked_inc_mod_u64().
+ */
+JEMALLOC_ALWAYS_INLINE bool
+counter_accum(tsdn_t *tsdn, counter_accum_t *counter, uint64_t bytes) {
+	bool triggered;
+	const uint64_t period = counter->interval;
+
+	assert(period > 0);
+	LOCKEDINT_MTX_LOCK(tsdn, counter->mtx);
+	/*
+	 * When events arrive quickly enough (and/or are handled slowly enough),
+	 * extreme overflow can coalesce several counter triggers into one.
+	 * That is deliberate: it keeps allocation from being rate-limited.
+	 */
+	triggered = locked_inc_mod_u64(tsdn, LOCKEDINT_MTX(counter->mtx),
+	    &counter->accumbytes, bytes, period);
+	LOCKEDINT_MTX_UNLOCK(tsdn, counter->mtx);
+	return triggered;
+}
+
+bool counter_accum_init(counter_accum_t *counter, uint64_t interval);
+void counter_prefork(tsdn_t *tsdn, counter_accum_t *counter);
+void counter_postfork_parent(tsdn_t *tsdn, counter_accum_t *counter);
+void counter_postfork_child(tsdn_t *tsdn, counter_accum_t *counter);
+
+#endif /* JEMALLOC_INTERNAL_COUNTER_H */
--- a/include/jemalloc/internal/ctl.h
+++ b/include/jemalloc/internal/ctl.h
@ -1,118 +1,172 @@
-/******************************************************************************/
-#ifdef JEMALLOC_H_TYPES
+#ifndef JEMALLOC_INTERNAL_CTL_H
+#define JEMALLOC_INTERNAL_CTL_H

-typedef struct ctl_node_s ctl_node_t;
-typedef struct ctl_named_node_s ctl_named_node_t;
-typedef struct ctl_indexed_node_s ctl_indexed_node_t;
-typedef struct ctl_arena_stats_s ctl_arena_stats_t;
-typedef struct ctl_stats_s ctl_stats_t;
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/arena_stats.h"
+#include "jemalloc/internal/background_thread_structs.h"
+#include "jemalloc/internal/bin_stats.h"
+#include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/malloc_io.h"
+#include "jemalloc/internal/mutex_prof.h"
+#include "jemalloc/internal/ql.h"
+#include "jemalloc/internal/sc.h"
+#include "jemalloc/internal/stats.h"

-#endif /* JEMALLOC_H_TYPES */
-/******************************************************************************/
-#ifdef JEMALLOC_H_STRUCTS
+/* Maximum ctl tree depth. */
+#define CTL_MAX_DEPTH 7
+#define CTL_MULTI_SETTING_MAX_LEN 1000

-struct ctl_node_s {
-	bool			named;
-};
+typedef struct ctl_node_s {
+	bool named;
+} ctl_node_t;

-struct ctl_named_node_s {
-	struct ctl_node_s	node;
-	const char		*name;
+typedef struct ctl_named_node_s {
+	ctl_node_t  node;
+	const char *name;
 	/* If (nchildren == 0), this is a terminal node. */
-	unsigned		nchildren;
-	const			ctl_node_t *children;
-	int			(*ctl)(tsd_t *, const size_t *, size_t, void *,
-	    size_t *, void *, size_t);
-};
+	size_t            nchildren;
+	const ctl_node_t *children;
+	int (*ctl)(
+	    tsd_t *, const size_t *, size_t, void *, size_t *, void *, size_t);
+} ctl_named_node_t;

-struct ctl_indexed_node_s {
-	struct ctl_node_s	node;
-	const ctl_named_node_t	*(*index)(tsdn_t *, const size_t *, size_t,
-	    size_t);
-};
+typedef struct ctl_indexed_node_s {
+	struct ctl_node_s node;
+	const ctl_named_node_t *(*index)(
+	    tsdn_t *, const size_t *, size_t, size_t);
+} ctl_indexed_node_t;

-struct ctl_arena_stats_s {
-	bool			initialized;
-	unsigned		nthreads;
-	const char		*dss;
-	ssize_t			lg_dirty_mult;
-	ssize_t			decay_time;
-	size_t			pactive;
-	size_t			pdirty;
-
-	/* The remainder are only populated if config_stats is true. */
-
-	arena_stats_t		astats;
+typedef struct ctl_arena_stats_s {
+	arena_stats_t astats;

 	/* Aggregate stats for small size classes, based on bin stats. */
-	size_t			allocated_small;
-	uint64_t		nmalloc_small;
-	uint64_t		ndalloc_small;
-	uint64_t		nrequests_small;
+	size_t   allocated_small;
+	uint64_t nmalloc_small;
+	uint64_t ndalloc_small;
+	uint64_t nrequests_small;
+	uint64_t nfills_small;
+	uint64_t nflushes_small;

-	malloc_bin_stats_t	bstats[NBINS];
-	malloc_large_stats_t	*lstats;	/* nlclasses elements. */
-	malloc_huge_stats_t	*hstats;	/* nhclasses elements. */
+	bin_stats_data_t    bstats[SC_NBINS];
+	arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
+	pac_estats_t        estats[SC_NPSIZES];
+	hpa_shard_stats_t   hpastats;
+} ctl_arena_stats_t;
+
+typedef struct ctl_stats_s {
+	size_t allocated;
+	size_t active;
+	size_t metadata;
+	size_t metadata_edata;
+	size_t metadata_rtree;
+	size_t metadata_thp;
+	size_t resident;
+	size_t mapped;
+	size_t retained;
+
+	background_thread_stats_t background_thread;
+	mutex_prof_data_t mutex_prof_data[mutex_prof_num_global_mutexes];
+} ctl_stats_t;
+
+typedef struct ctl_arena_s ctl_arena_t;
+struct ctl_arena_s {
+	unsigned arena_ind;
+	bool     initialized;
+	ql_elm(ctl_arena_t) destroyed_link;
+
+	/* Basic stats, supported even if !config_stats. */
+	unsigned    nthreads;
+	const char *dss;
+	ssize_t     dirty_decay_ms;
+	ssize_t     muzzy_decay_ms;
+	size_t      pactive;
+	size_t      pdirty;
+	size_t      pmuzzy;
+
+	/* NULL if !config_stats. */
+	ctl_arena_stats_t *astats;
 };

-struct ctl_stats_s {
-	size_t			allocated;
-	size_t			active;
-	size_t			metadata;
-	size_t			resident;
-	size_t			mapped;
-	size_t			retained;
-	unsigned		narenas;
-	ctl_arena_stats_t	*arenas;	/* (narenas + 1) elements. */
-};
+typedef struct ctl_arenas_s {
+	uint64_t epoch;
+	unsigned narenas;
+	ql_head(ctl_arena_t) destroyed;

-#endif /* JEMALLOC_H_STRUCTS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_EXTERNS
+	/*
+	 * Element 0 corresponds to merged stats for extant arenas (accessed via
+	 * MALLCTL_ARENAS_ALL), element 1 corresponds to merged stats for
+	 * destroyed arenas (accessed via MALLCTL_ARENAS_DESTROYED), and the
+	 * remaining MALLOCX_ARENA_LIMIT elements correspond to arenas.
+	 */
+	ctl_arena_t *arenas[2 + MALLOCX_ARENA_LIMIT];
+} ctl_arenas_t;

-int	ctl_byname(tsd_t *tsd, const char *name, void *oldp, size_t *oldlenp,
+int ctl_byname(tsd_t *tsd, const char *name, void *oldp, size_t *oldlenp,
    void *newp, size_t newlen);
-int	ctl_nametomib(tsdn_t *tsdn, const char *name, size_t *mibp,
-    size_t *miblenp);
-
-int	ctl_bymib(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
+int ctl_nametomib(tsd_t *tsd, const char *name, size_t *mibp, size_t *miblenp);
+int ctl_bymib(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
    size_t *oldlenp, void *newp, size_t newlen);
-bool	ctl_boot(void);
-void	ctl_prefork(tsdn_t *tsdn);
-void	ctl_postfork_parent(tsdn_t *tsdn);
-void	ctl_postfork_child(tsdn_t *tsdn);
+int ctl_mibnametomib(
+    tsd_t *tsd, size_t *mib, size_t miblen, const char *name, size_t *miblenp);
+int  ctl_bymibname(tsd_t *tsd, size_t *mib, size_t miblen, const char *name,
+     size_t *miblenp, void *oldp, size_t *oldlenp, void *newp, size_t newlen);
+bool ctl_boot(void);
+void ctl_prefork(tsdn_t *tsdn);
+void ctl_postfork_parent(tsdn_t *tsdn);
+void ctl_postfork_child(tsdn_t *tsdn);
+void ctl_mtx_assert_held(tsdn_t *tsdn);

-#define	xmallctl(name, oldp, oldlenp, newp, newlen) do {		\
-	if (je_mallctl(name, oldp, oldlenp, newp, newlen)		\
-	    != 0) {							\
-		malloc_printf(						\
-		    "<jemalloc>: Failure in xmallctl(\"%s\", ...)\n",	\
-		    name);						\
-		abort();						\
-	}								\
-} while (0)
+#define xmallctl(name, oldp, oldlenp, newp, newlen)                            \
+	do {                                                                   \
+		if (je_mallctl(name, oldp, oldlenp, newp, newlen) != 0) {      \
+			malloc_printf(                                         \
+			    "<jemalloc>: Failure in xmallctl(\"%s\", ...)\n",  \
+			    name);                                             \
+			abort();                                               \
+		}                                                              \
+	} while (0)

-#define	xmallctlnametomib(name, mibp, miblenp) do {			\
-	if (je_mallctlnametomib(name, mibp, miblenp) != 0) {		\
-		malloc_printf("<jemalloc>: Failure in "			\
-		    "xmallctlnametomib(\"%s\", ...)\n", name);		\
-		abort();						\
-	}								\
-} while (0)
+#define xmallctlnametomib(name, mibp, miblenp)                                 \
+	do {                                                                   \
+		if (je_mallctlnametomib(name, mibp, miblenp) != 0) {           \
+			malloc_printf(                                         \
+			    "<jemalloc>: Failure in "                          \
+			    "xmallctlnametomib(\"%s\", ...)\n",                \
+			    name);                                             \
+			abort();                                               \
+		}                                                              \
+	} while (0)

-#define	xmallctlbymib(mib, miblen, oldp, oldlenp, newp, newlen) do {	\
-	if (je_mallctlbymib(mib, miblen, oldp, oldlenp, newp,		\
-	    newlen) != 0) {						\
-		malloc_write(						\
-		    "<jemalloc>: Failure in xmallctlbymib()\n");	\
-		abort();						\
-	}								\
-} while (0)
+#define xmallctlbymib(mib, miblen, oldp, oldlenp, newp, newlen)                \
+	do {                                                                   \
+		if (je_mallctlbymib(mib, miblen, oldp, oldlenp, newp, newlen)  \
+		    != 0) {                                                    \
+			malloc_write(                                          \
+			    "<jemalloc>: Failure in xmallctlbymib()\n");       \
+			abort();                                               \
+		}                                                              \
+	} while (0)

-#endif /* JEMALLOC_H_EXTERNS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_INLINES
+#define xmallctlmibnametomib(mib, miblen, name, miblenp)                       \
+	do {                                                                   \
+		if (ctl_mibnametomib(tsd_fetch(), mib, miblen, name, miblenp)  \
+		    != 0) {                                                    \
+			malloc_write(                                          \
+			    "<jemalloc>: Failure in ctl_mibnametomib()\n");    \
+			abort();                                               \
+		}                                                              \
+	} while (0)

-#endif /* JEMALLOC_H_INLINES */
-/******************************************************************************/
+#define xmallctlbymibname(                                                     \
+    mib, miblen, name, miblenp, oldp, oldlenp, newp, newlen)                   \
+	do {                                                                   \
+		if (ctl_bymibname(tsd_fetch(), mib, miblen, name, miblenp,     \
+		        oldp, oldlenp, newp, newlen)                           \
+		    != 0) {                                                    \
+			malloc_write(                                          \
+			    "<jemalloc>: Failure in ctl_bymibname()\n");       \
+			abort();                                               \
+		}                                                              \
+	} while (0)

+#endif /* JEMALLOC_INTERNAL_CTL_H */
--- a/include/jemalloc/internal/decay.h
+++ b/include/jemalloc/internal/decay.h
@ -0,0 +1,188 @@
+#ifndef JEMALLOC_INTERNAL_DECAY_H
+#define JEMALLOC_INTERNAL_DECAY_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/smoothstep.h"
+
+#define DECAY_UNBOUNDED_TIME_TO_PURGE ((uint64_t) - 1)
+
+/*
+ * The decay_t computes the number of pages we should purge at any given time.
+ * Page allocators inform a decay object when pages enter a decay-able state
+ * (i.e. dirty or muzzy), and query it to determine how many pages should be
+ * purged at any given time.
+ *
+ * This is mostly a single-threaded data structure and doesn't care about
+ * synchronization at all; it's the caller's responsibility to manage their
+ * synchronization on their own.  There are two exceptions:
+ * 1) It's OK to racily call decay_ms_read (i.e. just the simplest state query).
+ * 2) The mtx and purging fields live (and are initialized) here, but are
+ *    logically owned by the page allocator.  This is just a convenience (since
+ *    those fields would be duplicated for both the dirty and muzzy states
+ *    otherwise).
+ */
+typedef struct decay_s decay_t;
+struct decay_s {
+	/* Synchronizes all non-atomic fields. */
+	malloc_mutex_t mtx;
+	/*
+	 * True if a thread is currently purging the extents associated with
+	 * this decay structure.
+	 */
+	bool purging;
+	/*
+	 * Approximate time in milliseconds from the creation of a set of unused
+	 * dirty pages until an equivalent set of unused dirty pages is purged
+	 * and/or reused.  Stored atomically so that decay_ms_read() can load
+	 * it without holding mtx.
+	 */
+	atomic_zd_t time_ms;
+	/*
+	 * Length of one decay epoch: the decay time divided by
+	 * SMOOTHSTEP_NSTEPS.  Reported by decay_epoch_duration_ns().
+	 */
+	nstime_t interval;
+	/*
+	 * Time at which the current decay interval logically started.  We do
+	 * not actually advance to a new epoch until sometime after it starts
+	 * because of scheduling and computation delays, and it is even possible
+	 * to completely skip epochs.  In all cases, during epoch advancement we
+	 * merge all relevant activity into the most recently recorded epoch.
+	 */
+	nstime_t epoch;
+	/* Deadline randomness generator. */
+	uint64_t jitter_state;
+	/*
+	 * Deadline for current epoch.  This is the sum of interval and per
+	 * epoch jitter which is a uniform random variable in [0..interval).
+	 * Epochs always advance by precise multiples of interval, but we
+	 * randomize the deadline to reduce the likelihood of arenas purging in
+	 * lockstep.
+	 */
+	nstime_t deadline;
+	/*
+	 * The number of pages we cap ourselves at in the current epoch, per
+	 * decay policies.  Updated on an epoch change.  After an epoch change,
+	 * the caller should take steps to try to purge down to this amount.
+	 */
+	size_t npages_limit;
+	/*
+	 * Number of unpurged pages at beginning of current epoch.  During epoch
+	 * advancement we use the delta between arena->decay_*.nunpurged and
+	 * ecache_npages_get(&arena->ecache_*) to determine how many dirty pages,
+	 * if any, were generated.
+	 */
+	size_t nunpurged;
+	/*
+	 * Trailing log of how many unused dirty pages were generated during
+	 * each of the past SMOOTHSTEP_NSTEPS decay epochs, where the last
+	 * element is the most recent epoch (the one decay_epoch_npages_delta()
+	 * returns).  Corresponding epoch times are relative to epoch.
+	 *
+	 * Updated only on epoch advance, triggered by
+	 * decay_maybe_advance_epoch, below.
+	 */
+	size_t backlog[SMOOTHSTEP_NSTEPS];
+
+	/* Peak number of pages in associated extents.  Used for debug only. */
+	uint64_t ceil_npages;
+};
+
+/*
+ * Fetch the decay time setting, in milliseconds, with a relaxed atomic load.
+ * This is the one public decay_t accessor that may be used without holding
+ * mtx.
+ */
+static inline ssize_t
+decay_ms_read(const decay_t *decay) {
+	ssize_t cur_ms = atomic_load_zd(&decay->time_ms, ATOMIC_RELAXED);
+	return cur_ms;
+}
+
+/*
+ * Page cap for this decay state in the current epoch; the npages_limit field
+ * comment in decay_t describes how it is maintained.
+ */
+static inline size_t
+decay_npages_limit_get(const decay_t *decay) {
+	const size_t limit = decay->npages_limit;
+	return limit;
+}
+
+/*
+ * Number of unused dirty pages generated during the last epoch, i.e. the final
+ * (most recent) backlog entry.
+ */
+static inline size_t
+decay_epoch_npages_delta(const decay_t *decay) {
+	const size_t newest = SMOOTHSTEP_NSTEPS - 1;
+	return decay->backlog[newest];
+}
+
+/*
+ * Length of the current epoch in nanoseconds, taken from the interval field.
+ * New epochs are started somewhat haphazardly, so this need not equal the time
+ * between any two calls to decay_maybe_advance_epoch; see the comments on the
+ * fields of decay_t.
+ */
+static inline uint64_t
+decay_epoch_duration_ns(const decay_t *decay) {
+	const nstime_t *epoch_len = &decay->interval;
+	return nstime_ns(epoch_len);
+}
+
+/* Returns true if the decay time is zero, i.e. pages decay immediately. */
+static inline bool
+decay_immediately(const decay_t *decay) {
+	return decay_ms_read(decay) == 0;
+}
+
+/* Returns true if the decay time is negative, i.e. pages never decay. */
+static inline bool
+decay_disabled(const decay_t *decay) {
+	return decay_ms_read(decay) < 0;
+}
+
+/* Returns true if decay is enabled and done gradually (decay time > 0). */
+static inline bool
+decay_gradually(const decay_t *decay) {
+	return decay_ms_read(decay) > 0;
+}
+
+/*
+ * Returns true if the passed in decay time setting is valid.
+ * < -1 : invalid
+ * -1   : never decay
+ *  0   : decay immediately
+ *  > 0 : some positive decay time, up to a maximum allowed value of
+ *  NSTIME_SEC_MAX * 1000, which corresponds to decaying somewhere in the early
+ *  27th century.  By that time, we expect to have implemented alternate purging
+ *  strategies.
+ */
+bool decay_ms_valid(ssize_t decay_ms);
+
+/*
+ * As a precondition, the decay_t must be zeroed out (as if with memset).
+ *
+ * Returns true on error.
+ */
+bool decay_init(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms);
+
+/*
+ * Given an already-initialized decay_t, reinitialize it with the given decay
+ * time.  The decay_t must have previously been initialized (and should not then
+ * be zeroed).
+ */
+void decay_reinit(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms);
+
+/*
+ * Compute how many of 'npages_new' pages we would need to purge in 'time'.
+ */
+uint64_t decay_npages_purge_in(
+    decay_t *decay, nstime_t *time, size_t npages_new);
+
+/* Returns true if the epoch advanced and there are pages to purge. */
+bool decay_maybe_advance_epoch(
+    decay_t *decay, nstime_t *new_time, size_t current_npages);
+
+/*
+ * Calculates wait time until a number of pages in the interval
+ * [0.5 * npages_threshold .. 1.5 * npages_threshold] should be purged.
+ *
+ * Returns number of nanoseconds or DECAY_UNBOUNDED_TIME_TO_PURGE in case of
+ * indefinite wait.
+ */
+uint64_t decay_ns_until_purge(
+    decay_t *decay, size_t npages_current, uint64_t npages_threshold);
+
+#endif /* JEMALLOC_INTERNAL_DECAY_H */
--- a/include/jemalloc/internal/div.h
+++ b/include/jemalloc/internal/div.h
@ -0,0 +1,42 @@
+#ifndef JEMALLOC_INTERNAL_DIV_H
+#define JEMALLOC_INTERNAL_DIV_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/assert.h"
+
+/*
+ * This module does the division that computes the index of a region in a slab,
+ * given its offset relative to the base.
+ * That is, given a divisor d and an n = i * d (all integers), we'll return i.
+ * We do some pre-computation to do this more quickly than a CPU division
+ * instruction.
+ * We bound n < 2^32, and don't support dividing by one.
+ */
+
+/*
+ * Precomputed state for dividing by one fixed divisor.
+ * NOTE(review): presumably filled in by div_init() -- confirm.
+ */
+typedef struct div_info_s div_info_t;
+struct div_info_s {
+	/* Multiplier that div_compute() applies before taking the high 32 bits. */
+	uint32_t magic;
+#ifdef JEMALLOC_DEBUG
+	/* Debug builds only: div_compute() asserts that result * d == n. */
+	size_t d;
+#endif
+};
+
+void div_init(div_info_t *div_info, size_t divisor);
+
+/*
+ * Return the quotient of n by the divisor that div_info describes.  n must fit
+ * in 32 bits; debug builds additionally check that the result times the
+ * divisor gives back n exactly.
+ */
+static inline size_t
+div_compute(div_info_t *div_info, size_t n) {
+	assert(n <= (uint32_t)-1);
+	/*
+	 * A widening multiply followed by taking the upper 32 bits.  On x86-64
+	 * this generates, e.g. mov; imul; shr.  On a 32-bit machine, the
+	 * compilers tried were all smart enough to emit the appropriate "get
+	 * the high 32 bits of the result of a multiply" sequence (e.g. mul;
+	 * mov edx eax; on x86, umull on arm, etc.).
+	 */
+	uint64_t product = (uint64_t)n * (uint64_t)div_info->magic;
+	size_t quotient = (size_t)(product >> 32);
+#ifdef JEMALLOC_DEBUG
+	assert(quotient * div_info->d == n);
+#endif
+	return quotient;
+}
+
+#endif /* JEMALLOC_INTERNAL_DIV_H */
--- a/include/jemalloc/internal/ecache.h
+++ b/include/jemalloc/internal/ecache.h
@ -0,0 +1,56 @@
+#ifndef JEMALLOC_INTERNAL_ECACHE_H
+#define JEMALLOC_INTERNAL_ECACHE_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/eset.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/san.h"
+
+typedef struct ecache_s ecache_t;
+struct ecache_s {
+	/* NOTE(review): presumably guards the two esets below -- confirm. */
+	malloc_mutex_t mtx;
+	/*
+	 * Two extent sets; the ecache_*_get() accessors report the sum over
+	 * both.  guarded_eset presumably holds page-guarded extents -- TODO
+	 * confirm against the implementation.
+	 */
+	eset_t         eset;
+	eset_t         guarded_eset;
+	/* All stored extents must be in the same state. */
+	extent_state_t state;
+	/* The index of the ehooks the ecache is associated with. */
+	unsigned ind;
+	/*
+	 * If true, delay coalescing until eviction; otherwise coalesce during
+	 * deallocation.
+	 */
+	bool delay_coalesce;
+};
+
+/* Total page count over both the regular and the guarded extent set. */
+static inline size_t
+ecache_npages_get(ecache_t *ecache) {
+	size_t npages = eset_npages_get(&ecache->eset);
+	npages += eset_npages_get(&ecache->guarded_eset);
+	return npages;
+}
+
+/*
+ * Number of extents held for page size index ind, summed over the regular
+ * and the guarded extent set.
+ */
+static inline size_t
+ecache_nextents_get(ecache_t *ecache, pszind_t ind) {
+	size_t nextents = eset_nextents_get(&ecache->eset, ind);
+	nextents += eset_nextents_get(&ecache->guarded_eset, ind);
+	return nextents;
+}
+
+/*
+ * Total bytes of the extents held for page size index ind, summed over the
+ * regular and the guarded extent set.
+ */
+static inline size_t
+ecache_nbytes_get(ecache_t *ecache, pszind_t ind) {
+	size_t nbytes = eset_nbytes_get(&ecache->eset, ind);
+	nbytes += eset_nbytes_get(&ecache->guarded_eset, ind);
+	return nbytes;
+}
+
+/* Returns the index of the ehooks this ecache is associated with. */
+static inline unsigned
+ecache_ind_get(ecache_t *ecache) {
+	unsigned ehooks_ind = ecache->ind;
+	return ehooks_ind;
+}
+
+bool ecache_init(tsdn_t *tsdn, ecache_t *ecache, extent_state_t state,
+    unsigned ind, bool delay_coalesce);
+void ecache_prefork(tsdn_t *tsdn, ecache_t *ecache);
+void ecache_postfork_parent(tsdn_t *tsdn, ecache_t *ecache);
+void ecache_postfork_child(tsdn_t *tsdn, ecache_t *ecache);
+
+#endif /* JEMALLOC_INTERNAL_ECACHE_H */
--- a/include/jemalloc/internal/edata.h
+++ b/include/jemalloc/internal/edata.h
@ -0,0 +1,795 @@
+#ifndef JEMALLOC_INTERNAL_EDATA_H
+#define JEMALLOC_INTERNAL_EDATA_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/bin_info.h"
+#include "jemalloc/internal/bit_util.h"
+#include "jemalloc/internal/hpdata.h"
+#include "jemalloc/internal/nstime.h"
+#include "jemalloc/internal/ph.h"
+#include "jemalloc/internal/prof_types.h"
+#include "jemalloc/internal/ql.h"
+#include "jemalloc/internal/sc.h"
+#include "jemalloc/internal/slab_data.h"
+#include "jemalloc/internal/sz.h"
+#include "jemalloc/internal/typed_list.h"
+
+/*
+ * sizeof(edata_t) is 128 bytes on 64-bit architectures.  Ensure the alignment
+ * to free up the low bits in the rtree leaf.
+ */
+#define EDATA_ALIGNMENT 128
+
+/*
+ * Defines how many nodes are visited when enumerating the heap to search for
+ * qualified extents.  More nodes visited may result in better choices at
+ * the cost of longer search time.  This size should not exceed 2^16 - 1
+ * because we use uint16_t for accessing the queue needed for enumeration.
+ */
+#define ESET_ENUMERATE_MAX_NUM 32
+
+/*
+ * Extent states as stored in the state field of edata_t's e_bits.  Values at
+ * or above extent_state_transition are the intermediate states that
+ * edata_state_in_transition() reports.
+ */
+enum extent_state_e {
+	extent_state_active = 0,
+	extent_state_dirty = 1,
+	extent_state_muzzy = 2,
+	extent_state_retained = 3,
+	extent_state_transition = 4, /* States below are intermediate. */
+	extent_state_merging = 5,
+	extent_state_max = 5 /* Sanity checking only. */
+};
+typedef enum extent_state_e extent_state_t;
+
+/*
+ * Head marker passed to edata_init(), which sets the is_head bit only for
+ * EXTENT_IS_HEAD.
+ */
+enum extent_head_state_e {
+	EXTENT_NOT_HEAD,
+	EXTENT_IS_HEAD /* See comments in ehooks_default_merge_impl(). */
+};
+typedef enum extent_head_state_e extent_head_state_t;
+
+/*
+ * Which implementation of the page allocator interface, (PAI, defined in
+ * pai.h) owns the given extent?
+ */
+enum extent_pai_e {
+	EXTENT_PAI_PAC = 0,
+	EXTENT_PAI_HPA = 1
+};
+typedef enum extent_pai_e extent_pai_t;
+
+/*
+ * Profiling record embedded in edata_t (as e_prof_info), where it shares a
+ * union with the inactive-list linkage and the slab metadata.
+ */
+struct e_prof_info_s {
+	/* Time when this was allocated. */
+	nstime_t e_prof_alloc_time;
+	/* Allocation request size. */
+	size_t e_prof_alloc_size;
+	/* Points to a prof_tctx_t. */
+	atomic_p_t e_prof_tctx;
+	/*
+	 * Points to a prof_recent_t for the allocation; NULL
+	 * means the recent allocation record no longer exists.
+	 * Protected by prof_recent_alloc_mtx.
+	 */
+	atomic_p_t e_prof_recent_alloc;
+};
+typedef struct e_prof_info_s e_prof_info_t;
+
+/*
+ * The information about a particular edata that lives in an emap.  Space is
+ * more precious there (the information, plus the edata pointer, has to live in
+ * a 64-bit word if we want to enable a packed representation).
+ *
+ * There are two things that are special about the information here:
+ * - It's quicker to access.  You have one fewer pointer hop, since finding the
+ *   edata_t associated with an item always requires accessing the rtree leaf in
+ *   which this data is stored.
+ * - It can be read unsynchronized, and without worrying about lifetime issues.
+ */
+typedef struct edata_map_info_s edata_map_info_t;
+struct edata_map_info_s {
+	/* NOTE(review): presumably mirrors the edata's slab flag -- confirm. */
+	bool    slab;
+	/* NOTE(review): presumably mirrors the edata's szind -- confirm. */
+	szind_t szind;
+};
+
+/*
+ * Sort key extracted from an edata_t by edata_cmp_summary_get() and ordered
+ * by edata_cmp_summary_comp(): serial number first, then address.
+ */
+typedef struct edata_cmp_summary_s edata_cmp_summary_t;
+struct edata_cmp_summary_s {
+	/* Copy of the edata's serial number (edata_sn_get()). */
+	uint64_t  sn;
+	/* The edata's extent address (edata_addr_get()) as an integer. */
+	uintptr_t addr;
+};
+
+/* Extent (span of pages).  Use accessor functions for e_* fields. */
+typedef struct edata_s edata_t;
+ph_structs(edata_avail, edata_t, ESET_ENUMERATE_MAX_NUM);
+ph_structs(edata_heap, edata_t, ESET_ENUMERATE_MAX_NUM);
+struct edata_s {
+	/*
+	 * Bitfield containing several fields:
+	 *
+	 * a: arena_ind
+	 * b: slab
+	 * c: committed
+	 * p: pai
+	 * z: zeroed
+	 * g: guarded
+	 * t: state
+	 * i: szind
+	 * f: nfree
+	 * s: bin_shard
+	 * h: is_head
+	 *
+	 * 00000000 ... 000hssss ssffffff ffffiiii iiiitttg zpcbaaaa aaaaaaaa
+	 *
+	 * arena_ind: Arena from which this extent came, or all 1 bits if
+	 *            unassociated.
+	 *
+	 * slab: The slab flag indicates whether the extent is used for a slab
+	 *       of small regions.  This helps differentiate small size classes,
+	 *       and it indicates whether interior pointers can be looked up via
+	 *       iealloc().
+	 *
+	 * committed: The committed flag indicates whether physical memory is
+	 *            committed to the extent, whether explicitly or implicitly
+	 *            as on a system that overcommits and satisfies physical
+	 *            memory needs on demand via soft page faults.
+	 *
+	 * pai: The pai flag is an extent_pai_t.
+	 *
+	 * zeroed: The zeroed flag is used by extent recycling code to track
+	 *         whether memory is zero-filled.
+	 *
+	 * guarded: The guarded flag is used by the sanitizer to track whether
+	 *          the extent has page guards around it.
+	 *
+	 * state: The state flag is an extent_state_t.
+	 *
+	 * szind: The szind flag indicates usable size class index for
+	 *        allocations residing in this extent, regardless of whether the
+	 *        extent is a slab.  Extent size and usable size often differ
+	 *        even for non-slabs, either due to sz_large_pad or promotion of
+	 *        sampled small regions.
+	 *
+	 * nfree: Number of free regions in slab.
+	 *
+	 * bin_shard: the shard of the bin from which this extent came.
+	 */
+	uint64_t e_bits;
+#define MASK(CURRENT_FIELD_WIDTH, CURRENT_FIELD_SHIFT)                         \
+	((((((uint64_t)0x1U) << (CURRENT_FIELD_WIDTH)) - 1))                   \
+	    << (CURRENT_FIELD_SHIFT))
+
+#define EDATA_BITS_ARENA_WIDTH MALLOCX_ARENA_BITS
+#define EDATA_BITS_ARENA_SHIFT 0
+#define EDATA_BITS_ARENA_MASK                                                  \
+	MASK(EDATA_BITS_ARENA_WIDTH, EDATA_BITS_ARENA_SHIFT)
+
+#define EDATA_BITS_SLAB_WIDTH 1
+#define EDATA_BITS_SLAB_SHIFT (EDATA_BITS_ARENA_WIDTH + EDATA_BITS_ARENA_SHIFT)
+#define EDATA_BITS_SLAB_MASK MASK(EDATA_BITS_SLAB_WIDTH, EDATA_BITS_SLAB_SHIFT)
+
+#define EDATA_BITS_COMMITTED_WIDTH 1
+#define EDATA_BITS_COMMITTED_SHIFT                                             \
+	(EDATA_BITS_SLAB_WIDTH + EDATA_BITS_SLAB_SHIFT)
+#define EDATA_BITS_COMMITTED_MASK                                              \
+	MASK(EDATA_BITS_COMMITTED_WIDTH, EDATA_BITS_COMMITTED_SHIFT)
+
+#define EDATA_BITS_PAI_WIDTH 1
+#define EDATA_BITS_PAI_SHIFT                                                   \
+	(EDATA_BITS_COMMITTED_WIDTH + EDATA_BITS_COMMITTED_SHIFT)
+#define EDATA_BITS_PAI_MASK MASK(EDATA_BITS_PAI_WIDTH, EDATA_BITS_PAI_SHIFT)
+
+#define EDATA_BITS_ZEROED_WIDTH 1
+#define EDATA_BITS_ZEROED_SHIFT (EDATA_BITS_PAI_WIDTH + EDATA_BITS_PAI_SHIFT)
+#define EDATA_BITS_ZEROED_MASK                                                 \
+	MASK(EDATA_BITS_ZEROED_WIDTH, EDATA_BITS_ZEROED_SHIFT)
+
+#define EDATA_BITS_GUARDED_WIDTH 1
+#define EDATA_BITS_GUARDED_SHIFT                                               \
+	(EDATA_BITS_ZEROED_WIDTH + EDATA_BITS_ZEROED_SHIFT)
+#define EDATA_BITS_GUARDED_MASK                                                \
+	MASK(EDATA_BITS_GUARDED_WIDTH, EDATA_BITS_GUARDED_SHIFT)
+
+#define EDATA_BITS_STATE_WIDTH 3
+#define EDATA_BITS_STATE_SHIFT                                                 \
+	(EDATA_BITS_GUARDED_WIDTH + EDATA_BITS_GUARDED_SHIFT)
+#define EDATA_BITS_STATE_MASK                                                  \
+	MASK(EDATA_BITS_STATE_WIDTH, EDATA_BITS_STATE_SHIFT)
+
+#define EDATA_BITS_SZIND_WIDTH LG_CEIL(SC_NSIZES)
+#define EDATA_BITS_SZIND_SHIFT (EDATA_BITS_STATE_WIDTH + EDATA_BITS_STATE_SHIFT)
+#define EDATA_BITS_SZIND_MASK                                                  \
+	MASK(EDATA_BITS_SZIND_WIDTH, EDATA_BITS_SZIND_SHIFT)
+
+#define EDATA_BITS_NFREE_WIDTH (SC_LG_SLAB_MAXREGS + 1)
+#define EDATA_BITS_NFREE_SHIFT (EDATA_BITS_SZIND_WIDTH + EDATA_BITS_SZIND_SHIFT)
+#define EDATA_BITS_NFREE_MASK                                                  \
+	MASK(EDATA_BITS_NFREE_WIDTH, EDATA_BITS_NFREE_SHIFT)
+
+#define EDATA_BITS_BINSHARD_WIDTH 6
+#define EDATA_BITS_BINSHARD_SHIFT                                              \
+	(EDATA_BITS_NFREE_WIDTH + EDATA_BITS_NFREE_SHIFT)
+#define EDATA_BITS_BINSHARD_MASK                                               \
+	MASK(EDATA_BITS_BINSHARD_WIDTH, EDATA_BITS_BINSHARD_SHIFT)
+
+#define EDATA_BITS_IS_HEAD_WIDTH 1
+#define EDATA_BITS_IS_HEAD_SHIFT                                               \
+	(EDATA_BITS_BINSHARD_WIDTH + EDATA_BITS_BINSHARD_SHIFT)
+#define EDATA_BITS_IS_HEAD_MASK                                                \
+	MASK(EDATA_BITS_IS_HEAD_WIDTH, EDATA_BITS_IS_HEAD_SHIFT)
+
+	/* Pointer to the extent that this structure is responsible for. */
+	void *e_addr;
+
+	union {
+		/*
+		 * Extent size and serial number associated with the extent
+		 * structure (different than the serial number for the extent at
+		 * e_addr).
+		 *
+		 * ssssssss [...] ssssssss ssssnnnn nnnnnnnn
+		 */
+		size_t e_size_esn;
+#define EDATA_SIZE_MASK ((size_t) ~(PAGE - 1))
+#define EDATA_ESN_MASK ((size_t)PAGE - 1)
+		/* Base extent size, which may not be a multiple of PAGE. */
+		size_t e_bsize;
+	};
+
+	/*
+	 * If this edata is a user allocation from an HPA, it comes out of some
+	 * pageslab (we don't yet support hugepage allocations that don't fit
+	 * into pageslabs).  This tracks it.
+	 */
+	hpdata_t *e_ps;
+
+	/*
+	 * Serial number.  These are not necessarily unique; splitting an extent
+	 * results in two extents with the same serial number.
+	 */
+	uint64_t e_sn;
+
+	union {
+		/*
+		 * List linkage used when the edata_t is active; either in
+		 * arena's large allocations or bin_t's slabs_full.
+		 */
+		ql_elm(edata_t) ql_link_active;
+		/*
+		 * Pairing heap linkage.  Used whenever the extent is inactive
+		 * (in the page allocators), or when it is active and in
+		 * slabs_nonfull, or when the edata_t is unassociated with an
+		 * extent and sitting in an edata_cache.
+		 */
+		union {
+			edata_heap_link_t  heap_link;
+			edata_avail_link_t avail_link;
+		};
+	};
+
+	union {
+		/*
+		 * List linkage used when the extent is inactive:
+		 * - Stashed dirty extents
+		 * - Ecache LRU functionality.
+		 */
+		ql_elm(edata_t) ql_link_inactive;
+		/* Small region slab metadata. */
+		slab_data_t e_slab_data;
+
+		/* Profiling data, used for large objects. */
+		e_prof_info_t e_prof_info;
+	};
+};
+
+TYPED_LIST(edata_list_active, edata_t, ql_link_active)
+TYPED_LIST(edata_list_inactive, edata_t, ql_link_inactive)
+
+/* Extracts the arena index field from e_bits. */
+static inline unsigned
+edata_arena_ind_get(const edata_t *edata) {
+	uint64_t arena_bits = edata->e_bits & EDATA_BITS_ARENA_MASK;
+	unsigned ind = (unsigned)(arena_bits >> EDATA_BITS_ARENA_SHIFT);
+	assert(ind < MALLOCX_ARENA_LIMIT);
+	return ind;
+}
+
+/*
+ * Extracts the size class index from e_bits.  The result may be SC_NSIZES,
+ * the value edata_szind_set() accepts as "invalid".
+ */
+static inline szind_t
+edata_szind_get_maybe_invalid(const edata_t *edata) {
+	uint64_t szind_bits = edata->e_bits & EDATA_BITS_SZIND_MASK;
+	szind_t  ind = (szind_t)(szind_bits >> EDATA_BITS_SZIND_SHIFT);
+	assert(ind <= SC_NSIZES);
+	return ind;
+}
+
+/* Like edata_szind_get_maybe_invalid(), but asserts the index is valid. */
+static inline szind_t
+edata_szind_get(const edata_t *edata) {
+	szind_t ind = edata_szind_get_maybe_invalid(edata);
+	/* Never call when "invalid". */
+	assert(ind < SC_NSIZES);
+	return ind;
+}
+
+/*
+ * Returns the usable size of the allocation described by edata, derived
+ * either from its size class index or from its extent size minus
+ * sz_large_pad, as explained below.
+ */
+static inline size_t
+edata_usize_get(const edata_t *edata) {
+	assert(edata != NULL);
+	/*
+	 * When sz_large_size_classes_disabled() is true, two cases:
+	 * 1. if usize_from_ind is not smaller than SC_LARGE_MINCLASS,
+	 * usize_from_size is accurate;
+	 * 2. otherwise, usize_from_ind is accurate.
+	 *
+	 * When sz_large_size_classes_disabled() is not true, the two should be the
+	 * same when usize_from_ind is not smaller than SC_LARGE_MINCLASS.
+	 *
+	 * Note sampled small allocs will be promoted.  Their extent size is
+	 * recorded in edata_size_get(edata), while their szind reflects the
+	 * true usize.  Thus, usize retrieved here is still accurate for
+	 * sampled small allocs.
+	 */
+	szind_t szind = edata_szind_get(edata);
+#ifdef JEMALLOC_JET
+	/*
+	 * Double free is invalid and results in undefined behavior.  However,
+	 * for double free tests to end gracefully, return an invalid usize
+	 * when szind shows the edata is not active, i.e., szind == SC_NSIZES.
+	 *
+	 * edata_szind_get() asserts szind < SC_NSIZES, so this branch is only
+	 * reachable when that assertion is not active.
+	 */
+	if (unlikely(szind == SC_NSIZES)) {
+		return SC_LARGE_MAXCLASS + 1;
+	}
+#endif
+
+	/* Index-derived path: size classes enabled, or a small size class. */
+	if (!sz_large_size_classes_disabled() || szind < SC_NBINS) {
+		size_t usize_from_ind = sz_index2size(szind);
+		if (!sz_large_size_classes_disabled()
+		    && usize_from_ind >= SC_LARGE_MINCLASS) {
+			/* Cross-check only; the value returned is unchanged. */
+			size_t size = (edata->e_size_esn & EDATA_SIZE_MASK);
+			assert(size > sz_large_pad);
+			size_t usize_from_size = size - sz_large_pad;
+			assert(usize_from_ind == usize_from_size);
+		}
+		return usize_from_ind;
+	}
+
+	/* Size-derived path: large size classes disabled and szind is large. */
+	size_t size = (edata->e_size_esn & EDATA_SIZE_MASK);
+	assert(size > sz_large_pad);
+	size_t usize_from_size = size - sz_large_pad;
+	/*
+	 * Whether or not large size classes are disabled, a usize derived from
+	 * the extent size is not accurate when it is smaller than
+	 * SC_LARGE_MINCLASS.
+	 */
+	assert(usize_from_size >= SC_LARGE_MINCLASS);
+	return usize_from_size;
+}
+
+/* Extracts the bin shard field from e_bits; requires a valid szind. */
+static inline unsigned
+edata_binshard_get(const edata_t *edata) {
+	uint64_t shard_bits = edata->e_bits & EDATA_BITS_BINSHARD_MASK;
+	unsigned shard = (unsigned)(shard_bits >> EDATA_BITS_BINSHARD_SHIFT);
+	assert(shard < bin_infos[edata_szind_get(edata)].n_shards);
+	return shard;
+}
+
+/* Returns the extent's serial number. */
+static inline uint64_t
+edata_sn_get(const edata_t *edata) {
+	uint64_t sn = edata->e_sn;
+	return sn;
+}
+
+/* Decodes the extent_state_t stored in e_bits. */
+static inline extent_state_t
+edata_state_get(const edata_t *edata) {
+	uint64_t state_bits = edata->e_bits & EDATA_BITS_STATE_MASK;
+	return (extent_state_t)(state_bits >> EDATA_BITS_STATE_SHIFT);
+}
+
+/* Reports whether the guarded bit is set in e_bits. */
+static inline bool
+edata_guarded_get(const edata_t *edata) {
+	uint64_t guarded_bit = edata->e_bits & EDATA_BITS_GUARDED_MASK;
+	return guarded_bit != 0;
+}
+
+/* Reports whether the zeroed bit is set in e_bits. */
+static inline bool
+edata_zeroed_get(const edata_t *edata) {
+	uint64_t zeroed_bit = edata->e_bits & EDATA_BITS_ZEROED_MASK;
+	return zeroed_bit != 0;
+}
+
+/* Reports whether the committed bit is set in e_bits. */
+static inline bool
+edata_committed_get(const edata_t *edata) {
+	uint64_t committed_bit = edata->e_bits & EDATA_BITS_COMMITTED_MASK;
+	return committed_bit != 0;
+}
+
+/* Decodes the extent_pai_t stored in e_bits. */
+static inline extent_pai_t
+edata_pai_get(const edata_t *edata) {
+	uint64_t pai_bits = edata->e_bits & EDATA_BITS_PAI_MASK;
+	return (extent_pai_t)(pai_bits >> EDATA_BITS_PAI_SHIFT);
+}
+
+/* Reports whether the slab bit is set in e_bits. */
+static inline bool
+edata_slab_get(const edata_t *edata) {
+	uint64_t slab_bit = edata->e_bits & EDATA_BITS_SLAB_MASK;
+	return slab_bit != 0;
+}
+
+/* Extracts the free-region count from e_bits; only valid for slabs. */
+static inline unsigned
+edata_nfree_get(const edata_t *edata) {
+	assert(edata_slab_get(edata));
+	uint64_t nfree_bits = edata->e_bits & EDATA_BITS_NFREE_MASK;
+	return (unsigned)(nfree_bits >> EDATA_BITS_NFREE_SHIFT);
+}
+
+/*
+ * Returns e_addr rounded by PAGE_ADDR2BASE().  Slab extents are asserted to
+ * already be at their page base.
+ */
+static inline void *
+edata_base_get(const edata_t *edata) {
+	void *addr = edata->e_addr;
+	assert(!edata_slab_get(edata) || addr == PAGE_ADDR2BASE(addr));
+	return PAGE_ADDR2BASE(addr);
+}
+
+/*
+ * Returns e_addr unmodified.  Slab extents are asserted to be at their page
+ * base.
+ */
+static inline void *
+edata_addr_get(const edata_t *edata) {
+	void *addr = edata->e_addr;
+	assert(!edata_slab_get(edata) || addr == PAGE_ADDR2BASE(addr));
+	return addr;
+}
+
+/* Returns the size portion of the packed e_size_esn word. */
+static inline size_t
+edata_size_get(const edata_t *edata) {
+	size_t size = edata->e_size_esn & EDATA_SIZE_MASK;
+	return size;
+}
+
+/* Returns the serial-number portion of the packed e_size_esn word. */
+static inline size_t
+edata_esn_get(const edata_t *edata) {
+	size_t esn = edata->e_size_esn & EDATA_ESN_MASK;
+	return esn;
+}
+
+/* Returns the base extent size (e_bsize shares storage with e_size_esn). */
+static inline size_t
+edata_bsize_get(const edata_t *edata) {
+	size_t bsize = edata->e_bsize;
+	return bsize;
+}
+
+/* Returns the pageslab pointer; asserts the extent is owned by the HPA. */
+static inline hpdata_t *
+edata_ps_get(const edata_t *edata) {
+	assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
+	hpdata_t *ps = edata->e_ps;
+	return ps;
+}
+
+/* Returns the address one PAGE below the extent's base. */
+static inline void *
+edata_before_get(const edata_t *edata) {
+	byte_t *base = (byte_t *)edata_base_get(edata);
+	return (void *)(base - PAGE);
+}
+
+/* Returns the address of the extent's final PAGE (base + size - PAGE). */
+static inline void *
+edata_last_get(const edata_t *edata) {
+	byte_t *base = (byte_t *)edata_base_get(edata);
+	byte_t *past = base + edata_size_get(edata);
+	return (void *)(past - PAGE);
+}
+
+/* Returns the first address past the end of the extent (base + size). */
+static inline void *
+edata_past_get(const edata_t *edata) {
+	byte_t *base = (byte_t *)edata_base_get(edata);
+	return (void *)(base + edata_size_get(edata));
+}
+
+/* Returns the embedded slab metadata; only valid for slabs. */
+static inline slab_data_t *
+edata_slab_data_get(edata_t *edata) {
+	assert(edata_slab_get(edata));
+	slab_data_t *slab_data = &edata->e_slab_data;
+	return slab_data;
+}
+
+/* Read-only variant of edata_slab_data_get(); only valid for slabs. */
+static inline const slab_data_t *
+edata_slab_data_get_const(const edata_t *edata) {
+	assert(edata_slab_get(edata));
+	const slab_data_t *slab_data = &edata->e_slab_data;
+	return slab_data;
+}
+
+/* Acquire-loads the prof_tctx_t pointer recorded for this extent. */
+static inline prof_tctx_t *
+edata_prof_tctx_get(const edata_t *edata) {
+	prof_tctx_t *tctx = (prof_tctx_t *)atomic_load_p(
+	    &edata->e_prof_info.e_prof_tctx, ATOMIC_ACQUIRE);
+	return tctx;
+}
+
+/* Returns a pointer to the recorded allocation time. */
+static inline const nstime_t *
+edata_prof_alloc_time_get(const edata_t *edata) {
+	const nstime_t *alloc_time = &edata->e_prof_info.e_prof_alloc_time;
+	return alloc_time;
+}
+
+/* Returns the recorded allocation request size. */
+static inline size_t
+edata_prof_alloc_size_get(const edata_t *edata) {
+	size_t alloc_size = edata->e_prof_info.e_prof_alloc_size;
+	return alloc_size;
+}
+
+/* Relaxed load of the prof_recent_t pointer recorded for this extent. */
+static inline prof_recent_t *
+edata_prof_recent_alloc_get_dont_call_directly(const edata_t *edata) {
+	prof_recent_t *recent = (prof_recent_t *)atomic_load_p(
+	    &edata->e_prof_info.e_prof_recent_alloc, ATOMIC_RELAXED);
+	return recent;
+}
+
+/* Replaces the arena index field of e_bits with arena_ind. */
+static inline void
+edata_arena_ind_set(edata_t *edata, unsigned arena_ind) {
+	uint64_t bits = edata->e_bits & ~EDATA_BITS_ARENA_MASK;
+	bits |= (uint64_t)arena_ind << EDATA_BITS_ARENA_SHIFT;
+	edata->e_bits = bits;
+}
+
+/* Replaces the bin shard field of e_bits with binshard. */
+static inline void
+edata_binshard_set(edata_t *edata, unsigned binshard) {
+	/* The assertion assumes szind is set already. */
+	assert(binshard < bin_infos[edata_szind_get(edata)].n_shards);
+	uint64_t bits = edata->e_bits & ~EDATA_BITS_BINSHARD_MASK;
+	bits |= (uint64_t)binshard << EDATA_BITS_BINSHARD_SHIFT;
+	edata->e_bits = bits;
+}
+
+/* Records addr as the extent's address. */
+static inline void
+edata_addr_set(edata_t *edata, void *addr) {
+	void **slot = &edata->e_addr;
+	*slot = addr;
+}
+
+/*
+ * Stores size in the size portion of e_size_esn, keeping the esn bits.  size
+ * must have no bits outside EDATA_SIZE_MASK.
+ */
+static inline void
+edata_size_set(edata_t *edata, size_t size) {
+	assert((size & ~EDATA_SIZE_MASK) == 0);
+	size_t kept_esn = edata->e_size_esn & ~EDATA_SIZE_MASK;
+	edata->e_size_esn = size | kept_esn;
+}
+
+/*
+ * Stores esn (truncated to EDATA_ESN_MASK) in the esn portion of e_size_esn,
+ * keeping the size bits.
+ */
+static inline void
+edata_esn_set(edata_t *edata, size_t esn) {
+	size_t kept_size = edata->e_size_esn & ~EDATA_ESN_MASK;
+	size_t new_esn = esn & EDATA_ESN_MASK;
+	edata->e_size_esn = kept_size | new_esn;
+}
+
+/* Records the base extent size (e_bsize shares storage with e_size_esn). */
+static inline void
+edata_bsize_set(edata_t *edata, size_t bsize) {
+	size_t *slot = &edata->e_bsize;
+	*slot = bsize;
+}
+
+/* Records the pageslab pointer; asserts the extent is owned by the HPA. */
+static inline void
+edata_ps_set(edata_t *edata, hpdata_t *ps) {
+	assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
+	hpdata_t **slot = &edata->e_ps;
+	*slot = ps;
+}
+
+/* Replaces the size class index field of e_bits with szind. */
+static inline void
+edata_szind_set(edata_t *edata, szind_t szind) {
+	/* SC_NSIZES means "invalid". */
+	assert(szind <= SC_NSIZES);
+	uint64_t bits = edata->e_bits & ~EDATA_BITS_SZIND_MASK;
+	bits |= (uint64_t)szind << EDATA_BITS_SZIND_SHIFT;
+	edata->e_bits = bits;
+}
+
+/* Replaces the free-region count in e_bits; only valid for slabs. */
+static inline void
+edata_nfree_set(edata_t *edata, unsigned nfree) {
+	assert(edata_slab_get(edata));
+	uint64_t bits = edata->e_bits & ~EDATA_BITS_NFREE_MASK;
+	bits |= (uint64_t)nfree << EDATA_BITS_NFREE_SHIFT;
+	edata->e_bits = bits;
+}
+
+/* Replaces the free-region count and the bin shard in a single update. */
+static inline void
+edata_nfree_binshard_set(edata_t *edata, unsigned nfree, unsigned binshard) {
+	/* The assertion assumes szind is set already. */
+	assert(binshard < bin_infos[edata_szind_get(edata)].n_shards);
+	uint64_t cleared = EDATA_BITS_NFREE_MASK | EDATA_BITS_BINSHARD_MASK;
+	uint64_t bits = edata->e_bits & ~cleared;
+	bits |= (uint64_t)binshard << EDATA_BITS_BINSHARD_SHIFT;
+	bits |= (uint64_t)nfree << EDATA_BITS_NFREE_SHIFT;
+	edata->e_bits = bits;
+}
+
+/* Adds one to the free-region count held in e_bits; only valid for slabs. */
+static inline void
+edata_nfree_inc(edata_t *edata) {
+	assert(edata_slab_get(edata));
+	const uint64_t one_region = (uint64_t)1U << EDATA_BITS_NFREE_SHIFT;
+	edata->e_bits = edata->e_bits + one_region;
+}
+
+/*
+ * Subtracts one from the free-region count held in e_bits; only valid for
+ * slabs.
+ */
+static inline void
+edata_nfree_dec(edata_t *edata) {
+	assert(edata_slab_get(edata));
+	const uint64_t one_region = (uint64_t)1U << EDATA_BITS_NFREE_SHIFT;
+	edata->e_bits = edata->e_bits - one_region;
+}
+
+/*
+ * Subtracts n from the free-region count held in e_bits; only valid for
+ * slabs.
+ */
+static inline void
+edata_nfree_sub(edata_t *edata, uint64_t n) {
+	assert(edata_slab_get(edata));
+	uint64_t delta = n << EDATA_BITS_NFREE_SHIFT;
+	edata->e_bits = edata->e_bits - delta;
+}
+
+/* Records sn as the extent's serial number. */
+static inline void
+edata_sn_set(edata_t *edata, uint64_t sn) {
+	uint64_t *slot = &edata->e_sn;
+	*slot = sn;
+}
+
+/* Replaces the state field of e_bits with state. */
+static inline void
+edata_state_set(edata_t *edata, extent_state_t state) {
+	uint64_t bits = edata->e_bits & ~EDATA_BITS_STATE_MASK;
+	bits |= (uint64_t)state << EDATA_BITS_STATE_SHIFT;
+	edata->e_bits = bits;
+}
+
+/* Sets or clears the guarded bit in e_bits. */
+static inline void
+edata_guarded_set(edata_t *edata, bool guarded) {
+	uint64_t bits = edata->e_bits & ~EDATA_BITS_GUARDED_MASK;
+	bits |= (uint64_t)guarded << EDATA_BITS_GUARDED_SHIFT;
+	edata->e_bits = bits;
+}
+
+/* Sets or clears the zeroed bit in e_bits. */
+static inline void
+edata_zeroed_set(edata_t *edata, bool zeroed) {
+	uint64_t bits = edata->e_bits & ~EDATA_BITS_ZEROED_MASK;
+	bits |= (uint64_t)zeroed << EDATA_BITS_ZEROED_SHIFT;
+	edata->e_bits = bits;
+}
+
+/* Sets or clears the committed bit in e_bits. */
+static inline void
+edata_committed_set(edata_t *edata, bool committed) {
+	uint64_t bits = edata->e_bits & ~EDATA_BITS_COMMITTED_MASK;
+	bits |= (uint64_t)committed << EDATA_BITS_COMMITTED_SHIFT;
+	edata->e_bits = bits;
+}
+
+/* Replaces the pai field of e_bits with pai. */
+static inline void
+edata_pai_set(edata_t *edata, extent_pai_t pai) {
+	uint64_t bits = edata->e_bits & ~EDATA_BITS_PAI_MASK;
+	bits |= (uint64_t)pai << EDATA_BITS_PAI_SHIFT;
+	edata->e_bits = bits;
+}
+
+/* Sets or clears the slab bit in e_bits. */
+static inline void
+edata_slab_set(edata_t *edata, bool slab) {
+	uint64_t bits = edata->e_bits & ~EDATA_BITS_SLAB_MASK;
+	bits |= (uint64_t)slab << EDATA_BITS_SLAB_SHIFT;
+	edata->e_bits = bits;
+}
+
+/* Release-stores tctx as the extent's profiling context pointer. */
+static inline void
+edata_prof_tctx_set(edata_t *edata, prof_tctx_t *tctx) {
+	atomic_p_t *slot = &edata->e_prof_info.e_prof_tctx;
+	atomic_store_p(slot, tctx, ATOMIC_RELEASE);
+}
+
+/* Copies *t into the extent's recorded allocation time. */
+static inline void
+edata_prof_alloc_time_set(edata_t *edata, nstime_t *t) {
+	nstime_t *alloc_time = &edata->e_prof_info.e_prof_alloc_time;
+	nstime_copy(alloc_time, t);
+}
+
+/* Records size as the extent's allocation request size. */
+static inline void
+edata_prof_alloc_size_set(edata_t *edata, size_t size) {
+	size_t *slot = &edata->e_prof_info.e_prof_alloc_size;
+	*slot = size;
+}
+
+/* Relaxed store of the prof_recent_t pointer recorded for this extent. */
+static inline void
+edata_prof_recent_alloc_set_dont_call_directly(
+    edata_t *edata, prof_recent_t *recent_alloc) {
+	atomic_p_t *slot = &edata->e_prof_info.e_prof_recent_alloc;
+	atomic_store_p(slot, recent_alloc, ATOMIC_RELAXED);
+}
+
+/*
+ * Reports whether the is_head bit is set in e_bits.  Takes a const pointer,
+ * like the other edata getters, since it only reads the structure.
+ */
+static inline bool
+edata_is_head_get(const edata_t *edata) {
+	return (bool)((edata->e_bits & EDATA_BITS_IS_HEAD_MASK)
+	    >> EDATA_BITS_IS_HEAD_SHIFT);
+}
+
+/* Sets or clears the is_head bit in e_bits. */
+static inline void
+edata_is_head_set(edata_t *edata, bool is_head) {
+	uint64_t bits = edata->e_bits & ~EDATA_BITS_IS_HEAD_MASK;
+	bits |= (uint64_t)is_head << EDATA_BITS_IS_HEAD_SHIFT;
+	edata->e_bits = bits;
+}
+
+/* True for extent_state_transition and every state numbered above it. */
+static inline bool
+edata_state_in_transition(extent_state_t state) {
+	bool settled = state < extent_state_transition;
+	return !settled;
+}
+
+/*
+ * Initializes the arena index, address, size, slab flag, szind, serial number,
+ * state, zeroed/committed flags, pai and is_head bit of *edata.  The guarded
+ * bit is always cleared, and the profiling tctx pointer is reset to NULL when
+ * config_prof is set.  The esn, nfree, bin shard and e_ps fields are not
+ * written here.
+ *
+ * Because this function is implemented as a sequence of bitfield modifications,
+ * even though each individual bit is properly initialized, we technically read
+ * uninitialized data within it.  This is mostly fine, since most callers get
+ * their edatas from zeroing sources, but callers who make stack edata_ts need
+ * to manually zero them.
+ */
+static inline void
+edata_init(edata_t *edata, unsigned arena_ind, void *addr, size_t size,
+    bool slab, szind_t szind, uint64_t sn, extent_state_t state, bool zeroed,
+    bool committed, extent_pai_t pai, extent_head_state_t is_head) {
+	/* Slab extents must start on a page boundary. */
+	assert(addr == PAGE_ADDR2BASE(addr) || !slab);
+
+	edata_arena_ind_set(edata, arena_ind);
+	edata_addr_set(edata, addr);
+	edata_size_set(edata, size);
+	edata_slab_set(edata, slab);
+	edata_szind_set(edata, szind);
+	edata_sn_set(edata, sn);
+	edata_state_set(edata, state);
+	edata_guarded_set(edata, false);
+	edata_zeroed_set(edata, zeroed);
+	edata_committed_set(edata, committed);
+	edata_pai_set(edata, pai);
+	edata_is_head_set(edata, is_head == EXTENT_IS_HEAD);
+	if (config_prof) {
+		edata_prof_tctx_set(edata, NULL);
+	}
+}
+
+/*
+ * Initializes *edata with a base extent size (e_bsize) rather than a
+ * size/esn pair: arena index is all ones, szind is SC_NSIZES, state is
+ * active, and the extent is marked zeroed, committed and non-slab.
+ *
+ * NOTE(review): unlike edata_init(), this does not write the is_head bit, so
+ * it keeps whatever value *edata already had -- confirm callers pass zeroed
+ * memory or never read it.
+ */
+static inline void
+edata_binit(
+    edata_t *edata, void *addr, size_t bsize, uint64_t sn, bool reused) {
+	/* All arena bits set. */
+	edata_arena_ind_set(edata, (1U << MALLOCX_ARENA_BITS) - 1);
+	edata_addr_set(edata, addr);
+	edata_bsize_set(edata, bsize);
+	edata_slab_set(edata, false);
+	edata_szind_set(edata, SC_NSIZES);
+	edata_sn_set(edata, sn);
+	edata_state_set(edata, extent_state_active);
+	/* See comments in base_edata_is_reused. */
+	edata_guarded_set(edata, reused);
+	edata_zeroed_set(edata, true);
+	edata_committed_set(edata, true);
+	/*
+	 * This isn't strictly true, but base allocated extents never get
+	 * deallocated and can't be looked up in the emap, but no sense in
+	 * wasting a state bit to encode this fact.
+	 */
+	edata_pai_set(edata, EXTENT_PAI_PAC);
+}
+
+/* Three-way comparison of two edatas by esn: returns -1, 0 or 1. */
+static inline int
+edata_esn_comp(const edata_t *a, const edata_t *b) {
+	size_t lhs = edata_esn_get(a);
+	size_t rhs = edata_esn_get(b);
+	int    greater = lhs > rhs;
+	int    less = lhs < rhs;
+
+	return greater - less;
+}
+
+/*
+ * Three-way comparison of two edata_t structures by their own addresses:
+ * returns -1, 0 or 1.
+ */
+static inline int
+edata_ead_comp(const edata_t *a, const edata_t *b) {
+	uintptr_t lhs = (uintptr_t)a;
+	uintptr_t rhs = (uintptr_t)b;
+	int       greater = lhs > rhs;
+	int       less = lhs < rhs;
+
+	return greater - less;
+}
+
+/* Captures the (serial number, address) pair used to order edatas. */
+static inline edata_cmp_summary_t
+edata_cmp_summary_get(const edata_t *edata) {
+	edata_cmp_summary_t summary;
+
+	summary.sn = edata_sn_get(edata);
+	summary.addr = (uintptr_t)edata_addr_get(edata);
+
+	return summary;
+}
+
+#ifdef JEMALLOC_HAVE_INT128
+/* Packs a summary into one 128-bit key: sn in the upper 64 bits, addr below. */
+JEMALLOC_ALWAYS_INLINE unsigned __int128
+edata_cmp_summary_encode(edata_cmp_summary_t src) {
+	unsigned __int128 key = (unsigned __int128)src.sn << 64;
+	key |= src.addr;
+	return key;
+}
+
+/* Orders by sn, then by addr; returns -1, 0 or 1. */
+static inline int
+edata_cmp_summary_comp(edata_cmp_summary_t a, edata_cmp_summary_t b) {
+	unsigned __int128 key_a = edata_cmp_summary_encode(a);
+	unsigned __int128 key_b = edata_cmp_summary_encode(b);
+	if (key_a == key_b) {
+		return 0;
+	}
+	if (key_a < key_b) {
+		return -1;
+	}
+	return 1;
+}
+#else
+/* Orders by sn, then by addr; only the sign of the result is meaningful. */
+static inline int
+edata_cmp_summary_comp(edata_cmp_summary_t a, edata_cmp_summary_t b) {
+	/*
+	 * Compare on `.sn` and fall back to `.addr` when the serial numbers
+	 * are equal.  The `.sn` result is doubled so that, whenever it is
+	 * nonzero, it dominates the `.addr` result in determining the sign of
+	 * the sum.  This form is branchless: every branch a straightforward
+	 * implementation would need is a common case, so branch prediction
+	 * accuracy is not great, and this implementation is measurably faster
+	 * (by around 30%).
+	 */
+	int sn_order = (a.sn > b.sn) - (a.sn < b.sn);
+	int addr_order = (a.addr > b.addr) - (a.addr < b.addr);
+	return (2 * sn_order) + addr_order;
+}
+#endif
+
+/* Compares two edatas by serial number, then by extent address. */
+static inline int
+edata_snad_comp(const edata_t *a, const edata_t *b) {
+	edata_cmp_summary_t lhs = edata_cmp_summary_get(a);
+	edata_cmp_summary_t rhs = edata_cmp_summary_get(b);
+	int                 order = edata_cmp_summary_comp(lhs, rhs);
+
+	return order;
+}
+
+/*
+ * Compares two edatas by esn, then by the address of the edata_t itself.
+ * Only the sign of the result is meaningful.
+ */
+static inline int
+edata_esnead_comp(const edata_t *a, const edata_t *b) {
+	/*
+	 * Branchless for the sake of performance, in the same way as
+	 * `edata_cmp_summary_comp`: the doubled esn result dominates.
+	 */
+	int esn_order = edata_esn_comp(a, b);
+	int ead_order = edata_ead_comp(a, b);
+	return (2 * esn_order) + ead_order;
+}
+
+ph_proto(, edata_avail, edata_t)
+ph_proto(, edata_heap, edata_t)
+
+#endif /* JEMALLOC_INTERNAL_EDATA_H */
--- a/include/jemalloc/internal/edata_cache.h
+++ b/include/jemalloc/internal/edata_cache.h
@ -0,0 +1,50 @@
+#ifndef JEMALLOC_INTERNAL_EDATA_CACHE_H
+#define JEMALLOC_INTERNAL_EDATA_CACHE_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/base.h"
+
+/* For tests only. */
+#define EDATA_CACHE_FAST_FILL 4
+
+/*
+ * A cache of edata_t structures allocated via base_alloc_edata (as opposed to
+ * the underlying extents they describe).  The contents of returned edata_t
+ * objects are garbage and cannot be relied upon.
+ */
+
+typedef struct edata_cache_s edata_cache_t;
+struct edata_cache_s {
+	/* NOTE(review): presumably the cached, currently unused edata_ts. */
+	edata_avail_t  avail;
+	/* NOTE(review): presumably the number of entries in avail -- confirm. */
+	atomic_zu_t    count;
+	/* NOTE(review): presumably guards avail -- confirm. */
+	malloc_mutex_t mtx;
+	/* NOTE(review): presumably the base used for base_alloc_edata. */
+	base_t        *base;
+};
+
+bool     edata_cache_init(edata_cache_t *edata_cache, base_t *base);
+edata_t *edata_cache_get(tsdn_t *tsdn, edata_cache_t *edata_cache);
+void edata_cache_put(tsdn_t *tsdn, edata_cache_t *edata_cache, edata_t *edata);
+
+void edata_cache_prefork(tsdn_t *tsdn, edata_cache_t *edata_cache);
+void edata_cache_postfork_parent(tsdn_t *tsdn, edata_cache_t *edata_cache);
+void edata_cache_postfork_child(tsdn_t *tsdn, edata_cache_t *edata_cache);
+
+/*
+ * An edata_cache_fast is like an edata_cache, but it relies on external
+ * synchronization and avoids first-fit strategies.
+ */
+
+typedef struct edata_cache_fast_s edata_cache_fast_t;
+struct edata_cache_fast_s {
+	/* NOTE(review): presumably the locally cached edata_ts -- confirm. */
+	edata_list_inactive_t list;
+	/* The edata_cache_t handed to edata_cache_fast_init(). */
+	edata_cache_t        *fallback;
+	/* NOTE(review): presumably set by edata_cache_fast_disable(). */
+	bool                  disabled;
+};
+
+void edata_cache_fast_init(edata_cache_fast_t *ecs, edata_cache_t *fallback);
+edata_t *edata_cache_fast_get(tsdn_t *tsdn, edata_cache_fast_t *ecs);
+void     edata_cache_fast_put(
+        tsdn_t *tsdn, edata_cache_fast_t *ecs, edata_t *edata);
+void edata_cache_fast_disable(tsdn_t *tsdn, edata_cache_fast_t *ecs);
+
+#endif /* JEMALLOC_INTERNAL_EDATA_CACHE_H */
--- a/include/jemalloc/internal/ehooks.h
+++ b/include/jemalloc/internal/ehooks.h
@ -0,0 +1,414 @@
+#ifndef JEMALLOC_INTERNAL_EHOOKS_H
+#define JEMALLOC_INTERNAL_EHOOKS_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/extent_mmap.h"
+#include "jemalloc/internal/tsd.h"
+#include "jemalloc/internal/tsd_types.h"
+
+/*
+ * This module is the internal interface to the extent hooks (both
+ * user-specified and external).  Eventually, this will give us the flexibility
+ * to use multiple different versions of user-visible extent-hook APIs under a
+ * single user interface.
+ *
+ * Current API expansions (not available to anyone but the default hooks yet):
+ *   - Head state tracking.  Hooks can decide whether or not to merge two
+ *     extents based on whether or not one of them is the head (i.e. was
+ *     allocated on its own).  The later extent loses its "head" status.
+ */
+
+extern const extent_hooks_t ehooks_default_extent_hooks;
+
+typedef struct ehooks_s ehooks_t;
+struct ehooks_s {
+	/*
+	 * The user-visible id that goes with the ehooks (i.e. that of the base
+	 * they're a part of, the associated arena's index within the arenas
+	 * array).
+	 */
+	unsigned ind;
+	/* Logically an extent_hooks_t *. */
+	atomic_p_t ptr;
+};
+
+extern const extent_hooks_t ehooks_default_extent_hooks;
+
+/*
+ * These are not really part of the public API.  Each hook has a fast-path for
+ * the default-hooks case that can avoid various small inefficiencies:
+ *   - Forgetting tsd and then calling tsd_get within the hook.
+ *   - Getting more state than necessary out of the extent_t.
+ *   - Doing arena_ind -> arena -> arena_ind lookups.
+ * By making the calls to these functions visible to the compiler, it can move
+ * those extra bits of computation down below the fast-paths where they get
+ * ignored.
+ */
+void *ehooks_default_alloc_impl(tsdn_t *tsdn, void *new_addr, size_t size,
+    size_t alignment, bool *zero, bool *commit, unsigned arena_ind);
+bool  ehooks_default_dalloc_impl(void *addr, size_t size);
+void  ehooks_default_destroy_impl(void *addr, size_t size);
+bool  ehooks_default_commit_impl(void *addr, size_t offset, size_t length);
+bool  ehooks_default_decommit_impl(void *addr, size_t offset, size_t length);
+#ifdef PAGES_CAN_PURGE_LAZY
+bool ehooks_default_purge_lazy_impl(void *addr, size_t offset, size_t length);
+#endif
+#ifdef PAGES_CAN_PURGE_FORCED
+bool ehooks_default_purge_forced_impl(void *addr, size_t offset, size_t length);
+#endif
+bool ehooks_default_split_impl(void);
+/*
+ * Merge is the only default extent hook we declare -- see the comment in
+ * ehooks_merge.
+ */
+bool ehooks_default_merge(extent_hooks_t *extent_hooks, void *addr_a,
+    size_t size_a, void *addr_b, size_t size_b, bool committed,
+    unsigned arena_ind);
+bool ehooks_default_merge_impl(tsdn_t *tsdn, void *addr_a, void *addr_b);
+void ehooks_default_zero_impl(void *addr, size_t size);
+void ehooks_default_guard_impl(void *guard1, void *guard2);
+void ehooks_default_unguard_impl(void *guard1, void *guard2);
+
+/*
+ * We don't officially support reentrancy from within the extent hooks.  But
+ * various people who sit within throwing distance of the jemalloc team want
+ * that functionality in certain limited cases.  The default reentrancy guards
+ * assert that we're not reentrant from a0 (since it's the bootstrap arena,
+ * where reentrant allocations would be redirected), which we would incorrectly
+ * trigger in cases where a0 has extent hooks (those hooks themselves can't be
+ * reentrant, then, but there are reasonable uses for such functionality, like
+ * putting internal metadata on hugepages).  Therefore, we use the raw
+ * reentrancy guards.
+ *
+ * Eventually, we need to think more carefully about whether and where we
+ * support allocating from within extent hooks (and what that means for things
+ * like profiling, stats collection, etc.), and document what the guarantee is.
+ */
+static inline void
+ehooks_pre_reentrancy(tsdn_t *tsdn) {
+	/* A null tsdn carries no tsd, so fetch the calling thread's. */
+	tsd_t *tsd;
+	if (tsdn_null(tsdn)) {
+		tsd = tsd_fetch();
+	} else {
+		tsd = tsdn_tsd(tsdn);
+	}
+	tsd_pre_reentrancy_raw(tsd);
+}
+
+static inline void
+ehooks_post_reentrancy(tsdn_t *tsdn) {
+	/* Mirror of ehooks_pre_reentrancy: resolve the tsd, then leave. */
+	tsd_t *tsd;
+	if (tsdn_null(tsdn)) {
+		tsd = tsd_fetch();
+	} else {
+		tsd = tsdn_tsd(tsdn);
+	}
+	tsd_post_reentrancy_raw(tsd);
+}
+
+/* Beginning of the public API. */
+void ehooks_init(ehooks_t *ehooks, extent_hooks_t *extent_hooks, unsigned ind);
+
+static inline unsigned
+ehooks_ind_get(const ehooks_t *ehooks) {
+	/* The user-visible id recorded in the ehooks. */
+	unsigned ind = ehooks->ind;
+	return ind;
+}
+
+static inline void
+ehooks_set_extent_hooks_ptr(ehooks_t *ehooks, extent_hooks_t *extent_hooks) {
+	/* Stored with release ordering; readers load with acquire. */
+	void *new_hooks = extent_hooks;
+	atomic_store_p(&ehooks->ptr, new_hooks, ATOMIC_RELEASE);
+}
+
+static inline extent_hooks_t *
+ehooks_get_extent_hooks_ptr(ehooks_t *ehooks) {
+	/* The field is an untyped atomic pointer; cast back on the way out. */
+	void *raw = atomic_load_p(&ehooks->ptr, ATOMIC_ACQUIRE);
+	return (extent_hooks_t *)raw;
+}
+
+static inline bool
+ehooks_are_default(ehooks_t *ehooks) {
+	/* Identity comparison against the built-in hooks table. */
+	extent_hooks_t *current = ehooks_get_extent_hooks_ptr(ehooks);
+	return current == &ehooks_default_extent_hooks;
+}
+
+/*
+ * In some cases, a caller needs to allocate resources before attempting to call
+ * a hook.  If that hook is doomed to fail, this is wasteful.  We therefore
+ * include some checks for such cases.
+ */
+static inline bool
+ehooks_dalloc_will_fail(ehooks_t *ehooks) {
+	/*
+	 * Load the hooks pointer exactly once, so that the default-vs-custom
+	 * decision and the dalloc NULL check are made against the same hooks
+	 * table even if the pointer is swapped concurrently.
+	 */
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		/* With the default hooks the answer is simply opt_retain. */
+		return opt_retain;
+	}
+	/* Custom hooks: fails iff no dalloc hook is installed. */
+	return extent_hooks->dalloc == NULL;
+}
+
+static inline bool
+ehooks_split_will_fail(ehooks_t *ehooks) {
+	/* True when the current hooks table has no split hook. */
+	extent_hooks_t *hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	return hooks->split == NULL;
+}
+
+static inline bool
+ehooks_merge_will_fail(ehooks_t *ehooks) {
+	/* True when the current hooks table has no merge hook. */
+	extent_hooks_t *hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	return hooks->merge == NULL;
+}
+
+static inline bool
+ehooks_guard_will_fail(ehooks_t *ehooks) {
+	/*
+	 * Guard hooks have not been officially introduced yet, so guarding is
+	 * only available when the default hooks are in use.
+	 */
+	bool is_default = ehooks_are_default(ehooks);
+	return !is_default;
+}
+
+/*
+ * Some hooks are required to return zeroed memory in certain situations.  In
+ * debug mode, we do some heuristic checks that they did what they were supposed
+ * to.
+ *
+ * This isn't really ehooks-specific (i.e. anyone can check for zeroed memory).
+ * But incorrect zero information indicates an ehook bug.
+ */
+static inline void
+ehooks_debug_zero_check(void *addr, size_t size) {
+	/* The range must be page-aligned, a page multiple, and non-empty. */
+	assert(((uintptr_t)addr & PAGE_MASK) == 0);
+	assert((size & PAGE_MASK) == 0);
+	assert(size > 0);
+	if (!config_debug) {
+		return;
+	}
+
+	size_t *p = (size_t *)addr;
+	/* Every word of the first page is inspected. */
+	for (size_t i = 0; i < PAGE / sizeof(size_t); i++) {
+		assert(p[i] == 0);
+	}
+	/*
+	 * Then 4 evenly spaced words across the whole range.  More samples
+	 * would catch more sparsely non-zero ranges, but would also cause
+	 * more page faults in debug mode.  FreeBSD does much of their
+	 * day-to-day development work in debug mode, so even debug builds
+	 * should not be too slow.
+	 */
+	const size_t nchecks = 4;
+	assert(PAGE >= sizeof(size_t) * nchecks);
+	for (size_t i = 0; i < nchecks; ++i) {
+		assert(p[i * (size / sizeof(size_t) / nchecks)] == 0);
+	}
+}
+
+static inline void *
+ehooks_alloc(tsdn_t *tsdn, ehooks_t *ehooks, void *new_addr, size_t size,
+    size_t alignment, bool *zero, bool *commit) {
+	/* Snapshot the caller's zero request for the sanity check below. */
+	bool            orig_zero = *zero;
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	void           *ret;
+
+	if (extent_hooks != &ehooks_default_extent_hooks) {
+		/* User hook: bracket the call with the raw reentrancy guards. */
+		ehooks_pre_reentrancy(tsdn);
+		ret = extent_hooks->alloc(extent_hooks, new_addr, size,
+		    alignment, zero, commit, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+	} else {
+		/* Default hooks: call the implementation directly. */
+		ret = ehooks_default_alloc_impl(tsdn, new_addr, size, alignment,
+		    zero, commit, ehooks_ind_get(ehooks));
+	}
+	/* A requested address is either honored exactly or the call fails. */
+	assert(new_addr == NULL || ret == NULL || new_addr == ret);
+	/* A request for zeroed memory may not be downgraded by the hook. */
+	assert(!orig_zero || *zero);
+	if (ret != NULL && *zero) {
+		ehooks_debug_zero_check(ret, size);
+	}
+	return ret;
+}
+
+static inline bool
+ehooks_dalloc(
+    tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, bool committed) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		return ehooks_default_dalloc_impl(addr, size);
+	}
+	if (extent_hooks->dalloc == NULL) {
+		/* No user dalloc hook installed: report an error. */
+		return true;
+	}
+	/* User hook: bracket the call with the raw reentrancy guards. */
+	ehooks_pre_reentrancy(tsdn);
+	bool err = extent_hooks->dalloc(
+	    extent_hooks, addr, size, committed, ehooks_ind_get(ehooks));
+	ehooks_post_reentrancy(tsdn);
+	return err;
+}
+
+static inline void
+ehooks_destroy(
+    tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, bool committed) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		ehooks_default_destroy_impl(addr, size);
+		return;
+	}
+	if (extent_hooks->destroy == NULL) {
+		/* No user destroy hook installed: nothing to do. */
+		return;
+	}
+	/* User hook: bracket the call with the raw reentrancy guards. */
+	ehooks_pre_reentrancy(tsdn);
+	extent_hooks->destroy(
+	    extent_hooks, addr, size, committed, ehooks_ind_get(ehooks));
+	ehooks_post_reentrancy(tsdn);
+}
+
+static inline bool
+ehooks_commit(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
+    size_t offset, size_t length) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	/* Stays true (error) when custom hooks provide no commit hook. */
+	bool err = true;
+
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		err = ehooks_default_commit_impl(addr, offset, length);
+	} else if (extent_hooks->commit != NULL) {
+		/* User hook: bracket the call with the raw reentrancy guards. */
+		ehooks_pre_reentrancy(tsdn);
+		err = extent_hooks->commit(extent_hooks, addr, size, offset,
+		    length, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+	}
+	if (err) {
+		return true;
+	}
+	/* On success, run the debug-mode zero check over the extent. */
+	ehooks_debug_zero_check(addr, size);
+	return false;
+}
+
+static inline bool
+ehooks_decommit(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
+    size_t offset, size_t length) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		return ehooks_default_decommit_impl(addr, offset, length);
+	}
+	if (extent_hooks->decommit == NULL) {
+		/* No user decommit hook installed: report an error. */
+		return true;
+	}
+	/* User hook: bracket the call with the raw reentrancy guards. */
+	ehooks_pre_reentrancy(tsdn);
+	bool err = extent_hooks->decommit(
+	    extent_hooks, addr, size, offset, length, ehooks_ind_get(ehooks));
+	ehooks_post_reentrancy(tsdn);
+	return err;
+}
+
+static inline bool
+ehooks_purge_lazy(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
+    size_t offset, size_t length) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+#ifdef PAGES_CAN_PURGE_LAZY
+	/* The default fast path only exists when lazy purging is compiled in. */
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		return ehooks_default_purge_lazy_impl(addr, offset, length);
+	}
+#endif
+	if (extent_hooks->purge_lazy != NULL) {
+		/* Bracket the hook call with the raw reentrancy guards. */
+		ehooks_pre_reentrancy(tsdn);
+		bool err = extent_hooks->purge_lazy(extent_hooks, addr, size,
+		    offset, length, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+		return err;
+	}
+	/* No purge_lazy hook in the table: report an error. */
+	return true;
+}
+
+static inline bool
+ehooks_purge_forced(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
+    size_t offset, size_t length) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	/*
+	 * purge_forced is required to zero, so an ehooks_debug_zero_check call
+	 * at the end of this function would be correct.  It is deliberately
+	 * omitted: checking would touch the page in question, which may have
+	 * performance consequences (imagine the hooks are using hugepages,
+	 * with a global zero page off).  Even in debug mode, it's usually a
+	 * good idea to avoid cases that can dramatically increase memory
+	 * consumption.
+	 */
+#ifdef PAGES_CAN_PURGE_FORCED
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		return ehooks_default_purge_forced_impl(addr, offset, length);
+	}
+#endif
+	if (extent_hooks->purge_forced != NULL) {
+		/* Bracket the hook call with the raw reentrancy guards. */
+		ehooks_pre_reentrancy(tsdn);
+		bool err = extent_hooks->purge_forced(extent_hooks, addr, size,
+		    offset, length, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+		return err;
+	}
+	/* No purge_forced hook in the table: report an error. */
+	return true;
+}
+
+static inline bool
+ehooks_split(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
+    size_t size_a, size_t size_b, bool committed) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+	/*
+	 * Compare the pointer loaded above rather than re-loading it through
+	 * ehooks_are_default(): a second atomic load could observe a different
+	 * hooks table than the one dereferenced below.  This also matches the
+	 * other ehooks_* wrappers.
+	 */
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		return ehooks_default_split_impl();
+	} else if (extent_hooks->split == NULL) {
+		/* No user split hook installed: report an error. */
+		return true;
+	} else {
+		/* User hook: bracket the call with the raw reentrancy guards. */
+		ehooks_pre_reentrancy(tsdn);
+		bool err = extent_hooks->split(extent_hooks, addr, size, size_a,
+		    size_b, committed, ehooks_ind_get(ehooks));
+		ehooks_post_reentrancy(tsdn);
+		return err;
+	}
+}
+
+static inline bool
+ehooks_merge(tsdn_t *tsdn, ehooks_t *ehooks, void *addr_a, size_t size_a,
+    void *addr_b, size_t size_b, bool committed) {
+	extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
+
+	if (extent_hooks == &ehooks_default_extent_hooks) {
+		/* The default implementation only needs the two addresses. */
+		return ehooks_default_merge_impl(tsdn, addr_a, addr_b);
+	}
+	if (extent_hooks->merge == NULL) {
+		/* No user merge hook installed: report an error. */
+		return true;
+	}
+	/* User hook: bracket the call with the raw reentrancy guards. */
+	ehooks_pre_reentrancy(tsdn);
+	bool err = extent_hooks->merge(extent_hooks, addr_a, size_a, addr_b,
+	    size_b, committed, ehooks_ind_get(ehooks));
+	ehooks_post_reentrancy(tsdn);
+	return err;
+}
+
+static inline void
+ehooks_zero(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size) {
+	if (ehooks_get_extent_hooks_ptr(ehooks)
+	    != &ehooks_default_extent_hooks) {
+		/*
+		 * The user-provided purge hooks could be used here (they are
+		 * required to have zeroed the extent if they indicate
+		 * success), but their cost is not necessarily known.  Be
+		 * conservative and use memset.
+		 */
+		memset(addr, 0, size);
+		return;
+	}
+	ehooks_default_zero_impl(addr, size);
+}
+
+static inline bool
+ehooks_guard(tsdn_t *tsdn, ehooks_t *ehooks, void *guard1, void *guard2) {
+	/* Guarding is only implemented for the default hooks. */
+	if (ehooks_get_extent_hooks_ptr(ehooks)
+	    != &ehooks_default_extent_hooks) {
+		return true;
+	}
+	ehooks_default_guard_impl(guard1, guard2);
+	return false;
+}
+
+static inline bool
+ehooks_unguard(tsdn_t *tsdn, ehooks_t *ehooks, void *guard1, void *guard2) {
+	/* Unguarding is only implemented for the default hooks. */
+	if (ehooks_get_extent_hooks_ptr(ehooks)
+	    != &ehooks_default_extent_hooks) {
+		return true;
+	}
+	ehooks_default_unguard_impl(guard1, guard2);
+	return false;
+}
+
+#endif /* JEMALLOC_INTERNAL_EHOOKS_H */
--- a/include/jemalloc/internal/emap.h
+++ b/include/jemalloc/internal/emap.h
@ -0,0 +1,397 @@
+#ifndef JEMALLOC_INTERNAL_EMAP_H
+#define JEMALLOC_INTERNAL_EMAP_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/base.h"
+#include "jemalloc/internal/rtree.h"
+
+/*
+ * Note: Ends without a semicolon, so that
+ *     EMAP_DECLARE_RTREE_CTX;
+ * in uses will avoid empty-statement warnings.
+ */
+#define EMAP_DECLARE_RTREE_CTX                                                 \
+	rtree_ctx_t  rtree_ctx_fallback;                                       \
+	rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback)
+
+typedef struct emap_s emap_t;
+struct emap_s {
+	rtree_t rtree;
+};
+
+/* Used to pass rtree lookup context down the path. */
+typedef struct emap_alloc_ctx_s emap_alloc_ctx_t;
+struct emap_alloc_ctx_s {
+	size_t  usize;
+	szind_t szind;
+	bool    slab;
+};
+
+typedef struct emap_full_alloc_ctx_s emap_full_alloc_ctx_t;
+struct emap_full_alloc_ctx_s {
+	szind_t  szind;
+	bool     slab;
+	edata_t *edata;
+};
+
+bool emap_init(emap_t *emap, base_t *base, bool zeroed);
+
+void emap_remap(
+    tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind, bool slab);
+
+void emap_update_edata_state(
+    tsdn_t *tsdn, emap_t *emap, edata_t *edata, extent_state_t state);
+
+/*
+ * The two acquire functions below allow accessing neighbor edatas, if it's safe
+ * and valid to do so (i.e. from the same arena, of the same state, etc.).  This
+ * is necessary because the ecache locks are state based, and only protect
+ * edatas with the same state.  Therefore the neighbor edata's state needs to be
+ * verified first, before chasing the edata pointer.  The returned edata will be
+ * in an acquired state, meaning other threads will be prevented from accessing
+ * it, even if technically the edata can still be discovered from the rtree.
+ *
+ * This means, at any moment when holding pointers to edata, either one of the
+ * state based locks is held (and the edatas are all of the protected state), or
+ * the edatas are in an acquired state (e.g. in active or merging state).  The
+ * acquire operation itself (changing the edata to an acquired state) is done
+ * under the state locks.
+ */
+edata_t *emap_try_acquire_edata_neighbor(tsdn_t *tsdn, emap_t *emap,
+    edata_t *edata, extent_pai_t pai, extent_state_t expected_state,
+    bool forward);
+edata_t *emap_try_acquire_edata_neighbor_expand(tsdn_t *tsdn, emap_t *emap,
+    edata_t *edata, extent_pai_t pai, extent_state_t expected_state);
+void     emap_release_edata(
+        tsdn_t *tsdn, emap_t *emap, edata_t *edata, extent_state_t new_state);
+
+/*
+ * Associate the given edata with its beginning and end address, setting the
+ * szind and slab info appropriately.
+ * Returns true on error (i.e. resource exhaustion).
+ */
+bool emap_register_boundary(
+    tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind, bool slab);
+
+/*
+ * Does the same thing, but with the interior of the range, for slab
+ * allocations.
+ *
+ * You might wonder why we don't just have a single emap_register function that
+ * does both depending on the value of 'slab'.  The answer is twofold:
+ * - As a practical matter, in places like the extract->split->commit pathway,
+ *   we defer the interior operation until we're sure that the commit won't fail
+ *   (but we have to register the split boundaries there).
+ * - In general, we're trying to move to a world where the page-specific
+ *   allocator doesn't know as much about how the pages it allocates will be
+ *   used, and passing a 'slab' parameter everywhere makes that more
+ *   complicated.
+ *
+ * Unlike the boundary version, this function can't fail; this is because slabs
+ * can't get big enough to touch a new page that neither of the boundaries
+ * touched, so no allocation is necessary to fill the interior once the boundary
+ * has been touched.
+ */
+void emap_register_interior(
+    tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind);
+
+void emap_deregister_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
+void emap_deregister_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
+
+typedef struct emap_prepare_s emap_prepare_t;
+struct emap_prepare_s {
+	rtree_leaf_elm_t *lead_elm_a;
+	rtree_leaf_elm_t *lead_elm_b;
+	rtree_leaf_elm_t *trail_elm_a;
+	rtree_leaf_elm_t *trail_elm_b;
+};
+
+/**
+ * These functions do the emap metadata management for merging, splitting, and
+ * reusing extents.  In particular, they set the boundary mappings from
+ * addresses to edatas.  If the result is going to be used as a slab, you
+ * still need to call emap_register_interior on it, though.
+ *
+ * Remap simply changes the szind and slab status of an extent's boundary
+ * mappings.  If the extent is not a slab, it doesn't bother with updating the
+ * end mapping (since lookups only occur in the interior of an extent for
+ * slabs).  Since the szind and slab status only make sense for active extents,
+ * this should only be called while activating or deactivating an extent.
+ *
+ * Split and merge have a "prepare" and a "commit" portion.  The prepare portion
+ * does the operations that can be done without exclusive access to the extent
+ * in question, while the commit variant requires exclusive access to maintain
+ * the emap invariants.  The only function that can fail is emap_split_prepare,
+ * and it returns true on failure (at which point the caller shouldn't commit).
+ *
+ * In all cases, "lead" refers to the lower-addressed extent, and trail to the
+ * higher-addressed one.  It's the caller's responsibility to set the edata
+ * state appropriately.
+ */
+bool emap_split_prepare(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare,
+    edata_t *edata, size_t size_a, edata_t *trail, size_t size_b);
+void emap_split_commit(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare,
+    edata_t *lead, size_t size_a, edata_t *trail, size_t size_b);
+void emap_merge_prepare(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare,
+    edata_t *lead, edata_t *trail);
+void emap_merge_commit(tsdn_t *tsdn, emap_t *emap, emap_prepare_t *prepare,
+    edata_t *lead, edata_t *trail);
+
+/* Assert that the emap's view of the given edata matches the edata's view. */
+void emap_do_assert_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
+static inline void
+emap_assert_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
+	/* The out-of-line check only runs in debug builds. */
+	if (!config_debug) {
+		return;
+	}
+	emap_do_assert_mapped(tsdn, emap, edata);
+}
+
+/* Assert that the given edata isn't in the map. */
+void emap_do_assert_not_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
+static inline void
+emap_assert_not_mapped(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
+	/* The out-of-line check only runs in debug builds. */
+	if (!config_debug) {
+		return;
+	}
+	emap_do_assert_not_mapped(tsdn, emap, edata);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+emap_edata_in_transition(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
+	/* Asserts config_debug, i.e. intended for debug builds only. */
+	assert(config_debug);
+	emap_assert_mapped(tsdn, emap, edata);
+
+	EMAP_DECLARE_RTREE_CTX;
+	/* The state is read from the rtree entry at the edata's base. */
+	uintptr_t        key = (uintptr_t)edata_base_get(edata);
+	rtree_contents_t contents = rtree_read(
+	    tsdn, &emap->rtree, rtree_ctx, key);
+	extent_state_t state = contents.metadata.state;
+
+	return edata_state_in_transition(state);
+}
+
+JEMALLOC_ALWAYS_INLINE bool
+emap_edata_is_acquired(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
+	if (!config_debug) {
+		/* Only meaningful inside assertions. */
+		return false;
+	}
+
+	/*
+	 * An edata counts as acquired when no other thread will attempt to
+	 * read / write any of its fields.  That covers:
+	 *
+	 * 1) an edata not hooked into the emap yet -- it was just allocated
+	 * or initialized.
+	 *
+	 * 2) an edata in an active or transition state -- it can still be
+	 * discovered from the emap, but the state tracked in the rtree keeps
+	 * other threads from accessing the actual edata.
+	 */
+	EMAP_DECLARE_RTREE_CTX;
+	uintptr_t         key = (uintptr_t)edata_base_get(edata);
+	rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &emap->rtree,
+	    rtree_ctx, key, /* dependent */ false, /* init_missing */ false);
+	if (elm == NULL) {
+		/* No leaf element for this address: case 1. */
+		return true;
+	}
+	rtree_contents_t contents = rtree_leaf_elm_read(
+	    tsdn, &emap->rtree, elm, /* dependent */ false);
+
+	return contents.edata == NULL
+	    || contents.metadata.state == extent_state_active
+	    || edata_state_in_transition(contents.metadata.state);
+}
+
+/*
+ * Asserts the preconditions for coalescing `inner` with `outer`: both share
+ * the same arena index, pai, and committed status; `inner` is active and
+ * `outer` is in the merging state; neither is guarded; and the two are
+ * adjacent in the address space (in either order).
+ */
+JEMALLOC_ALWAYS_INLINE void
+extent_assert_can_coalesce(const edata_t *inner, const edata_t *outer) {
+	assert(edata_arena_ind_get(inner) == edata_arena_ind_get(outer));
+	assert(edata_pai_get(inner) == edata_pai_get(outer));
+	assert(edata_committed_get(inner) == edata_committed_get(outer));
+	assert(edata_state_get(inner) == extent_state_active);
+	assert(edata_state_get(outer) == extent_state_merging);
+	assert(!edata_guarded_get(inner) && !edata_guarded_get(outer));
+	/* Adjacent: one extent begins exactly where the other one ends. */
+	assert(edata_base_get(inner) == edata_past_get(outer)
+	    || edata_base_get(outer) == edata_past_get(inner));
+}
+
+/*
+ * Asserts the preconditions for expanding `original` into `expand`: both
+ * share the same arena index and pai; `original` is active and `expand` is in
+ * the merging state; and `expand` begins exactly where `original` ends.
+ */
+JEMALLOC_ALWAYS_INLINE void
+extent_assert_can_expand(const edata_t *original, const edata_t *expand) {
+	assert(edata_arena_ind_get(original) == edata_arena_ind_get(expand));
+	assert(edata_pai_get(original) == edata_pai_get(expand));
+	assert(edata_state_get(original) == extent_state_active);
+	assert(edata_state_get(expand) == extent_state_merging);
+	/* Expansion is forward-only: `expand` must directly follow. */
+	assert(edata_past_get(original) == edata_base_get(expand));
+}
+
+JEMALLOC_ALWAYS_INLINE edata_t *
+emap_edata_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr) {
+	EMAP_DECLARE_RTREE_CTX;
+
+	/* Read the rtree entry for ptr and hand back only its edata. */
+	rtree_contents_t contents = rtree_read(
+	    tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr);
+	return contents.edata;
+}
+
+JEMALLOC_ALWAYS_INLINE void
+emap_alloc_ctx_init(
+    emap_alloc_ctx_t *alloc_ctx, szind_t szind, bool slab, size_t usize) {
+	/* Fill in all three fields, then sanity-check usize against szind. */
+	alloc_ctx->usize = usize;
+	alloc_ctx->slab = slab;
+	alloc_ctx->szind = szind;
+	assert(
+	    sz_large_size_classes_disabled() || usize == sz_index2size(szind));
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
+emap_alloc_ctx_usize_get(emap_alloc_ctx_t *alloc_ctx) {
+	assert(alloc_ctx->szind < SC_NSIZES);
+	if (!alloc_ctx->slab) {
+		/* Non-slab: the stored usize is returned as-is. */
+		assert(sz_large_size_classes_disabled()
+		    || alloc_ctx->usize == sz_index2size(alloc_ctx->szind));
+		assert(alloc_ctx->usize <= SC_LARGE_MAXCLASS);
+		return alloc_ctx->usize;
+	}
+	/* Slab: the size is derived from the size class index. */
+	assert(alloc_ctx->usize == sz_index2size(alloc_ctx->szind));
+	return sz_index2size(alloc_ctx->szind);
+}
+
+/* Fills in alloc_ctx with the info in the map. */
+JEMALLOC_ALWAYS_INLINE void
+emap_alloc_ctx_lookup(
+    tsdn_t *tsdn, emap_t *emap, const void *ptr, emap_alloc_ctx_t *alloc_ctx) {
+	EMAP_DECLARE_RTREE_CTX;
+
+	rtree_contents_t contents = rtree_read(
+	    tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr);
+	/*
+	 * For an invalid alloc, skip the usize computation: the edata could
+	 * be corrupted, so it must not be dereferenced.
+	 */
+	size_t usize;
+	if (contents.metadata.szind == SC_NSIZES || contents.edata == NULL) {
+		usize = 0;
+	} else {
+		usize = edata_usize_get(contents.edata);
+	}
+	emap_alloc_ctx_init(alloc_ctx, contents.metadata.szind,
+	    contents.metadata.slab, usize);
+}
+
+/* The pointer must be mapped. */
+JEMALLOC_ALWAYS_INLINE void
+emap_full_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
+    emap_full_alloc_ctx_t *full_alloc_ctx) {
+	EMAP_DECLARE_RTREE_CTX;
+
+	/* Copy the rtree entry for ptr into the caller's context. */
+	rtree_contents_t contents = rtree_read(
+	    tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr);
+	full_alloc_ctx->szind = contents.metadata.szind;
+	full_alloc_ctx->slab = contents.metadata.slab;
+	full_alloc_ctx->edata = contents.edata;
+}
+
+/*
+ * The pointer is allowed to not be mapped.
+ *
+ * Returns true when the pointer is not present.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+emap_full_alloc_ctx_try_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
+    emap_full_alloc_ctx_t *full_alloc_ctx) {
+	EMAP_DECLARE_RTREE_CTX;
+
+	rtree_contents_t contents;
+	if (rtree_read_independent(
+	        tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr, &contents)) {
+		/* Lookup failed; leave full_alloc_ctx untouched. */
+		return true;
+	}
+	full_alloc_ctx->szind = contents.metadata.szind;
+	full_alloc_ctx->slab = contents.metadata.slab;
+	full_alloc_ctx->edata = contents.edata;
+	return false;
+}
+
+/*
+ * Only used on the fastpath of free.  Returns true when cannot be fulfilled by
+ * fast path, e.g. when the metadata key is not cached.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+emap_alloc_ctx_try_lookup_fast(
+    tsd_t *tsd, emap_t *emap, const void *ptr, emap_alloc_ctx_t *alloc_ctx) {
+	/* Use the unsafe getter since this may get called during exit. */
+	rtree_ctx_t *rtree_ctx = tsd_rtree_ctxp_get_unsafe(tsd);
+
+	rtree_metadata_t metadata;
+	if (rtree_metadata_try_read_fast(tsd_tsdn(tsd), &emap->rtree,
+	        rtree_ctx, (uintptr_t)ptr, &metadata)) {
+		/* The fast read could not be served; caller must fall back. */
+		return true;
+	}
+	/*
+	 * Small allocs using the fastpath can always use index to get the
+	 * usize.  Therefore, do not set alloc_ctx->usize here.
+	 */
+	alloc_ctx->slab = metadata.slab;
+	alloc_ctx->szind = metadata.szind;
+	if (config_debug) {
+		/*
+		 * Debug builds store a value above SC_LARGE_MAXCLASS, which
+		 * fails the usize assertions in emap_alloc_ctx_usize_get --
+		 * presumably to catch accidental use of the unset field.
+		 */
+		alloc_ctx->usize = SC_LARGE_MAXCLASS + 1;
+	}
+	return false;
+}
+
+/*
+ * We want to do batch lookups out of the cache bins, which use
+ * cache_bin_ptr_array_get to access the i'th element of the bin (since they
+ * invert usual ordering in deciding what to flush).  This lets the emap avoid
+ * caring about its caller's ordering.
+ */
+typedef const void *(*emap_ptr_getter)(void *ctx, size_t ind);
+/*
+ * This allows size-checking assertions, which we can only do while we're in the
+ * process of edata lookups.
+ */
+typedef void (*emap_metadata_visitor)(
+    void *ctx, emap_full_alloc_ctx_t *alloc_ctx);
+
+typedef union emap_batch_lookup_result_u emap_batch_lookup_result_t;
+union emap_batch_lookup_result_u {
+	edata_t          *edata;
+	rtree_leaf_elm_t *rtree_leaf;
+};
+
+/*
+ * Looks up nptrs pointers in two passes over the result array:
+ *   1) For each index i, fetch the pointer via ptr_getter(ptr_getter_ctx, i)
+ *      and store its rtree leaf element in result[i].rtree_leaf.
+ *   2) For each index i, read that leaf, overwrite result[i] with the edata
+ *      pointer, and call metadata_visitor(metadata_visitor_ctx, ...) with the
+ *      szind / slab / edata read from the leaf.
+ * On return, every result[i].edata holds the edata read for pointer i.
+ * tsd must be non-NULL.
+ */
+JEMALLOC_ALWAYS_INLINE void
+emap_edata_lookup_batch(tsd_t *tsd, emap_t *emap, size_t nptrs,
+    emap_ptr_getter ptr_getter, void *ptr_getter_ctx,
+    emap_metadata_visitor metadata_visitor, void *metadata_visitor_ctx,
+    emap_batch_lookup_result_t *result) {
+	/* Avoids null-checking tsdn in the loop below. */
+	util_assume(tsd != NULL);
+	rtree_ctx_t *rtree_ctx = tsd_rtree_ctxp_get(tsd);
+
+	/* Pass 1: resolve every pointer to its rtree leaf element. */
+	for (size_t i = 0; i < nptrs; i++) {
+		const void *ptr = ptr_getter(ptr_getter_ctx, i);
+		/*
+		 * Reuse the result array as a temp buffer, lying a little
+		 * about the types: the union slot holds a leaf pointer now and
+		 * is overwritten with the edata pointer in the second pass.
+		 */
+		result[i].rtree_leaf = rtree_leaf_elm_lookup(tsd_tsdn(tsd),
+		    &emap->rtree, rtree_ctx, (uintptr_t)ptr,
+		    /* dependent */ true, /* init_missing */ false);
+	}
+
+	/* Pass 2: read each leaf, publish the edata, notify the visitor. */
+	for (size_t i = 0; i < nptrs; i++) {
+		rtree_leaf_elm_t *elm = result[i].rtree_leaf;
+		rtree_contents_t  contents = rtree_leaf_elm_read(
+                    tsd_tsdn(tsd), &emap->rtree, elm, /* dependent */ true);
+		result[i].edata = contents.edata;
+		emap_full_alloc_ctx_t alloc_ctx;
+		/*
+		 * Not all these fields are read in practice by the metadata
+		 * visitor.  But the compiler can easily optimize away the ones
+		 * that aren't, so no sense in being incomplete.
+		 */
+		alloc_ctx.szind = contents.metadata.szind;
+		alloc_ctx.slab = contents.metadata.slab;
+		alloc_ctx.edata = contents.edata;
+		metadata_visitor(metadata_visitor_ctx, &alloc_ctx);
+	}
+}
+
+#endif /* JEMALLOC_INTERNAL_EMAP_H */
--- a/include/jemalloc/internal/emitter.h
+++ b/include/jemalloc/internal/emitter.h
@ -0,0 +1,530 @@
+#ifndef JEMALLOC_INTERNAL_EMITTER_H
+#define JEMALLOC_INTERNAL_EMITTER_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/assert.h"
+#include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/malloc_io.h"
+#include "jemalloc/internal/ql.h"
+
+/* Output encodings an emitter can produce. */
+typedef enum emitter_output_e emitter_output_t;
+enum emitter_output_e {
+	/* JSON with newlines and indentation. */
+	emitter_output_json,
+	/* JSON without the newlines and indentation. */
+	emitter_output_json_compact,
+	/* Text output, produced by the emitter_table_* functions. */
+	emitter_output_table
+};
+
+/*
+ * How a value is justified within its field width (see emitter_gen_fmt, which
+ * turns this into a printf width specification).
+ */
+typedef enum emitter_justify_e emitter_justify_t;
+enum emitter_justify_e {
+	emitter_justify_left,
+	emitter_justify_right,
+	/* Not for users; just to pass to internal functions. */
+	emitter_justify_none
+};
+
+/*
+ * Tags the C type of a value handed (by pointer) to the emitter; see
+ * emitter_print_value for the type each tag is read as.
+ */
+typedef enum emitter_type_e emitter_type_t;
+enum emitter_type_e {
+	emitter_type_bool,
+	emitter_type_int,
+	emitter_type_int64,
+	emitter_type_unsigned,
+	emitter_type_uint32,
+	emitter_type_uint64,
+	emitter_type_size,
+	emitter_type_ssize,
+	emitter_type_string,
+	/*
+	 * A title is a column title in a table; it's just a string, but it's
+	 * not quoted.
+	 */
+	emitter_type_title,
+};
+
+/* One cell of a table row. */
+typedef struct emitter_col_s emitter_col_t;
+struct emitter_col_s {
+	/* Filled in by the user. */
+	emitter_justify_t justify;
+	int               width;
+	emitter_type_t    type;
+	/*
+	 * The cell's value.  emitter_table_row passes the address of the first
+	 * member to emitter_print_value, which reads it according to type.
+	 */
+	union {
+		bool        bool_val;
+		int         int_val;
+		unsigned    unsigned_val;
+		uint32_t    uint32_val;
+		uint32_t    uint32_t_val;
+		uint64_t    uint64_val;
+		uint64_t    uint64_t_val;
+		size_t      size_val;
+		ssize_t     ssize_val;
+		const char *str_val;
+	};
+
+	/* Filled in by initialization. */
+	ql_elm(emitter_col_t) link;
+};
+
+/*
+ * A table row: the list of its columns.  emitter_col_init appends a column at
+ * the tail of the list.
+ */
+typedef struct emitter_row_s emitter_row_t;
+struct emitter_row_s {
+	ql_head(emitter_col_t) cols;
+};
+
+/* State for one output stream; every field is set by emitter_init. */
+typedef struct emitter_s emitter_t;
+struct emitter_s {
+	/* Output encoding selected in emitter_init. */
+	emitter_output_t output;
+	/* The output information. */
+	write_cb_t *write_cb;
+	void       *cbopaque;
+	/* Current nesting level; determines the amount of indentation. */
+	int         nesting_depth;
+	/* True if we've already emitted a value at the given depth. */
+	bool item_at_depth;
+	/* True if we emitted a key and will emit corresponding value next. */
+	bool emitted_key;
+};
+
+/* Returns true for both JSON output modes (regular and compact). */
+static inline bool
+emitter_outputs_json(emitter_t *emitter) {
+	switch (emitter->output) {
+	case emitter_output_json:
+	case emitter_output_json_compact:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * Internal convenience function.  Write to the emitter the given string: the
+ * printf-style format and its arguments are passed, together with the
+ * emitter's write_cb and cbopaque, to malloc_vcprintf.
+ */
+JEMALLOC_FORMAT_PRINTF(2, 3)
+static inline void
+emitter_printf(emitter_t *emitter, const char *format, ...) {
+	va_list ap;
+
+	va_start(ap, format);
+	malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap);
+	va_end(ap);
+}
+
+/*
+ * Builds, in out_fmt, a printf format string derived from fmt_specifier with
+ * the requested justification applied.  The first character of fmt_specifier
+ * (the '%') is skipped and the remainder <spec> is re-emitted as:
+ *   emitter_justify_none  -> "%<spec>"
+ *   emitter_justify_left  -> "%-<width><spec>"
+ *   otherwise             -> "%<width><spec>"
+ * Returns out_fmt.
+ */
+static inline const char *
+JEMALLOC_FORMAT_ARG(3) emitter_gen_fmt(char *out_fmt, size_t out_size,
+    const char *fmt_specifier, emitter_justify_t justify, int width) {
+	/* Everything after the leading '%' of the caller's specifier. */
+	const char *conversion = fmt_specifier + 1;
+	size_t      written;
+
+	switch (justify) {
+	case emitter_justify_none:
+		written = malloc_snprintf(
+		    out_fmt, out_size, "%%%s", conversion);
+		break;
+	case emitter_justify_left:
+		written = malloc_snprintf(
+		    out_fmt, out_size, "%%-%d%s", width, conversion);
+		break;
+	default:
+		written = malloc_snprintf(
+		    out_fmt, out_size, "%%%d%s", width, conversion);
+		break;
+	}
+	/* Only happens in case of bad format string, which *we* choose. */
+	assert(written < out_size);
+	return out_fmt;
+}
+
+/*
+ * Emits str wrapped in double quotes.  A string whose quoted form does not
+ * fit in the local BUF_SIZE-byte buffer is emitted in several pieces.  fmt is
+ * scratch space of fmt_size bytes for the generated format string.
+ */
+static inline void
+emitter_emit_str(emitter_t *emitter, emitter_justify_t justify, int width,
+    char *fmt, size_t fmt_size, const char *str) {
+#define BUF_SIZE 256
+	char   buf[BUF_SIZE];
+	size_t str_written = malloc_snprintf(buf, BUF_SIZE, "\"%s\"", str);
+	emitter_printf(
+	    emitter, emitter_gen_fmt(fmt, fmt_size, "%s", justify, width), buf);
+	if (str_written < BUF_SIZE) {
+		return;
+	}
+	/*
+	 * There is no support for long string justification at the moment as
+	 * we output them partially with multiple malloc_snprintf calls and
+	 * justification will work correctly only within one call.
+	 * Fortunately this is not a big concern as we don't use justification
+	 * with long strings right now.
+	 *
+	 * We emitted leading quotation mark and trailing '\0', hence need to
+	 * exclude extra characters from str shift.
+	 */
+	str += BUF_SIZE - 2;
+	do {
+		str_written = malloc_snprintf(buf, BUF_SIZE, "%s\"", str);
+		/* A truncated piece consumed BUF_SIZE - 1 characters of str. */
+		str += str_written >= BUF_SIZE ? BUF_SIZE - 1 : str_written;
+		emitter_printf(emitter,
+		    emitter_gen_fmt(fmt, fmt_size, "%s", justify, width), buf);
+	} while (str_written >= BUF_SIZE);
+#undef BUF_SIZE
+}
+
+/*
+ * Internal.  Emit the given value type in the relevant encoding (so that the
+ * bool true gets mapped to json "true", but the string "true" gets mapped to
+ * json "\"true\"", for instance).
+ *
+ * value points at an object of the C type that corresponds to value_type; for
+ * emitter_type_string and emitter_type_title it points at a char pointer.
+ *
+ * Width is ignored if justify is emitter_justify_none.
+ */
+static inline void
+emitter_print_value(emitter_t *emitter, emitter_justify_t justify, int width,
+    emitter_type_t value_type, const void *value) {
+#define FMT_SIZE 10
+	/*
+	 * We dynamically generate a format string to emit, to let us use the
+	 * snprintf machinery.  This is kinda hacky, but gets the job done
+	 * quickly without having to think about the various snprintf edge
+	 * cases.
+	 */
+	char fmt[FMT_SIZE];
+
+	/* Prints *value, read as the given type, with the given conversion. */
+#define EMIT_SIMPLE(type, format)                                              \
+	emitter_printf(emitter,                                                \
+	    emitter_gen_fmt(fmt, FMT_SIZE, format, justify, width),            \
+	    *(const type *)value);
+
+	switch (value_type) {
+	case emitter_type_bool:
+		emitter_printf(emitter,
+		    emitter_gen_fmt(fmt, FMT_SIZE, "%s", justify, width),
+		    *(const bool *)value ? "true" : "false");
+		break;
+	case emitter_type_int:
+		EMIT_SIMPLE(int, "%d")
+		break;
+	case emitter_type_int64:
+		EMIT_SIMPLE(int64_t, "%" FMTd64)
+		break;
+	case emitter_type_unsigned:
+		EMIT_SIMPLE(unsigned, "%u")
+		break;
+	case emitter_type_ssize:
+		EMIT_SIMPLE(ssize_t, "%zd")
+		break;
+	case emitter_type_size:
+		EMIT_SIMPLE(size_t, "%zu")
+		break;
+	case emitter_type_string:
+		/* Strings are quoted, and may need to be emitted in pieces. */
+		emitter_emit_str(emitter, justify, width, fmt, FMT_SIZE,
+		    *(const char *const *)value);
+		break;
+	case emitter_type_uint32:
+		EMIT_SIMPLE(uint32_t, "%" FMTu32)
+		break;
+	case emitter_type_uint64:
+		EMIT_SIMPLE(uint64_t, "%" FMTu64)
+		break;
+	case emitter_type_title:
+		/* Titles are printed as-is, without quotes. */
+		EMIT_SIMPLE(char *const, "%s")
+		break;
+	default:
+		unreachable();
+	}
+	/* Keep the helper macros from leaking into files that include us. */
+#undef EMIT_SIMPLE
+#undef FMT_SIZE
+}
+
+/* Internal functions.  In json mode, tracks nesting state. */
+
+/* Enters a new nesting level, at which nothing has been emitted yet. */
+static inline void
+emitter_nest_inc(emitter_t *emitter) {
+	emitter->item_at_depth = false;
+	emitter->nesting_depth++;
+}
+
+/*
+ * Leaves the current nesting level.  The enclosing level is marked as already
+ * containing an item.
+ */
+static inline void
+emitter_nest_dec(emitter_t *emitter) {
+	emitter->item_at_depth = true;
+	emitter->nesting_depth--;
+}
+
+/*
+ * Emits the indentation for the current nesting depth: one tab per level in
+ * json mode, two spaces per level otherwise.  Must not be called in compact
+ * json mode.
+ */
+static inline void
+emitter_indent(emitter_t *emitter) {
+	assert(emitter->output != emitter_output_json_compact);
+
+	bool        tabs = (emitter->output == emitter_output_json);
+	const char *unit = tabs ? "\t" : " ";
+	int         count = tabs ? emitter->nesting_depth
+	                         : emitter->nesting_depth * 2;
+
+	for (int emitted = 0; emitted < count; emitted++) {
+		emitter_printf(emitter, "%s", unit);
+	}
+}
+
+/*
+ * Emits whatever must precede the next JSON entity.  Right after a key nothing
+ * is printed (the pending-key flag is just cleared).  Otherwise a comma is
+ * printed if an item was already emitted at this depth, followed, in
+ * non-compact mode, by a newline and the indentation.
+ */
+static inline void
+emitter_json_key_prefix(emitter_t *emitter) {
+	assert(emitter_outputs_json(emitter));
+	if (emitter->emitted_key) {
+		/* The value goes directly after its key. */
+		emitter->emitted_key = false;
+		return;
+	}
+	if (emitter->item_at_depth) {
+		emitter_printf(emitter, ",");
+	}
+	if (emitter->output == emitter_output_json_compact) {
+		return;
+	}
+	emitter_printf(emitter, "\n");
+	emitter_indent(emitter);
+}
+
+/******************************************************************************/
+/* Public functions for emitter_t. */
+
+/*
+ * Initializes emitter to write in the given output mode through write_cb and
+ * cbopaque, starting at nesting depth 0 with no item and no key emitted.
+ */
+static inline void
+emitter_init(emitter_t *emitter, emitter_output_t emitter_output,
+    write_cb_t *write_cb, void *cbopaque) {
+	/* Where and how to write. */
+	emitter->write_cb = write_cb;
+	emitter->cbopaque = cbopaque;
+	emitter->output = emitter_output;
+	/* Nesting state. */
+	emitter->nesting_depth = 0;
+	emitter->emitted_key = false;
+	emitter->item_at_depth = false;
+}
+
+/******************************************************************************/
+/* JSON public API. */
+
+/*
+ * Emits a key (e.g. as appears in an object). The next json entity emitted will
+ * be the corresponding value.  Does nothing outside the JSON modes.
+ */
+static inline void
+emitter_json_key(emitter_t *emitter, const char *json_key) {
+	if (!emitter_outputs_json(emitter)) {
+		return;
+	}
+	emitter_json_key_prefix(emitter);
+	/* Compact mode leaves out the space after the colon. */
+	const char *after_colon =
+	    (emitter->output == emitter_output_json_compact) ? "" : " ";
+	emitter_printf(emitter, "\"%s\":%s", json_key, after_colon);
+	emitter->emitted_key = true;
+}
+
+/*
+ * Emits a JSON value of the given type, preceded by whatever prefix is due,
+ * and records that an item now exists at this depth.  Does nothing outside
+ * the JSON modes.
+ */
+static inline void
+emitter_json_value(
+    emitter_t *emitter, emitter_type_t value_type, const void *value) {
+	if (!emitter_outputs_json(emitter)) {
+		return;
+	}
+	emitter_json_key_prefix(emitter);
+	emitter_print_value(
+	    emitter, emitter_justify_none, -1, value_type, value);
+	emitter->item_at_depth = true;
+}
+
+/*
+ * Shorthand for calling emitter_json_key and then emitter_json_value.  Both
+ * callees do nothing outside the JSON modes, so this is a no-op there too.
+ */
+static inline void
+emitter_json_kv(emitter_t *emitter, const char *json_key,
+    emitter_type_t value_type, const void *value) {
+	emitter_json_key(emitter, json_key);
+	emitter_json_value(emitter, value_type, value);
+}
+
+/*
+ * Opens a JSON array: emits the due prefix and "[", then enters a new nesting
+ * level.  Does nothing outside the JSON modes.
+ */
+static inline void
+emitter_json_array_begin(emitter_t *emitter) {
+	if (!emitter_outputs_json(emitter)) {
+		return;
+	}
+	emitter_json_key_prefix(emitter);
+	emitter_printf(emitter, "[");
+	emitter_nest_inc(emitter);
+}
+
+/*
+ * Shorthand for calling emitter_json_key and then emitter_json_array_begin,
+ * i.e. opens an array that is the value of json_key.
+ */
+static inline void
+emitter_json_array_kv_begin(emitter_t *emitter, const char *json_key) {
+	emitter_json_key(emitter, json_key);
+	emitter_json_array_begin(emitter);
+}
+
+/*
+ * Closes the JSON array opened by emitter_json_array_begin: leaves the nesting
+ * level and emits "]" (on its own indented line unless in compact mode).
+ * Does nothing outside the JSON modes.
+ */
+static inline void
+emitter_json_array_end(emitter_t *emitter) {
+	if (!emitter_outputs_json(emitter)) {
+		return;
+	}
+	assert(emitter->nesting_depth > 0);
+	emitter_nest_dec(emitter);
+	bool compact = (emitter->output == emitter_output_json_compact);
+	if (!compact) {
+		emitter_printf(emitter, "\n");
+		emitter_indent(emitter);
+	}
+	emitter_printf(emitter, "]");
+}
+
+/*
+ * Opens a JSON object: emits the due prefix and "{", then enters a new nesting
+ * level.  Does nothing outside the JSON modes.
+ */
+static inline void
+emitter_json_object_begin(emitter_t *emitter) {
+	if (!emitter_outputs_json(emitter)) {
+		return;
+	}
+	emitter_json_key_prefix(emitter);
+	emitter_printf(emitter, "{");
+	emitter_nest_inc(emitter);
+}
+
+/*
+ * Shorthand for calling emitter_json_key and then emitter_json_object_begin,
+ * i.e. opens an object that is the value of json_key.
+ */
+static inline void
+emitter_json_object_kv_begin(emitter_t *emitter, const char *json_key) {
+	emitter_json_key(emitter, json_key);
+	emitter_json_object_begin(emitter);
+}
+
+/*
+ * Closes the JSON object opened by emitter_json_object_begin: leaves the
+ * nesting level and emits "}" (on its own indented line unless in compact
+ * mode).  Does nothing outside the JSON modes.
+ */
+static inline void
+emitter_json_object_end(emitter_t *emitter) {
+	if (!emitter_outputs_json(emitter)) {
+		return;
+	}
+	assert(emitter->nesting_depth > 0);
+	emitter_nest_dec(emitter);
+	bool compact = (emitter->output == emitter_output_json_compact);
+	if (!compact) {
+		emitter_printf(emitter, "\n");
+		emitter_indent(emitter);
+	}
+	emitter_printf(emitter, "}");
+}
+
+/******************************************************************************/
+/* Table public API. */
+
+/*
+ * Table mode only: prints table_key on its own indented line and enters a new
+ * nesting level.
+ */
+static inline void
+emitter_table_dict_begin(emitter_t *emitter, const char *table_key) {
+	if (emitter->output != emitter_output_table) {
+		return;
+	}
+	emitter_indent(emitter);
+	emitter_printf(emitter, "%s\n", table_key);
+	emitter_nest_inc(emitter);
+}
+
+/* Table mode only: leaves the level entered by emitter_table_dict_begin. */
+static inline void
+emitter_table_dict_end(emitter_t *emitter) {
+	if (emitter->output != emitter_output_table) {
+		return;
+	}
+	emitter_nest_dec(emitter);
+}
+
+/*
+ * In table mode, prints an indented "key: value" line, followed by
+ * " (note_key: note_value)" when table_note_key is non-NULL.  In every mode,
+ * item_at_depth is set on return.
+ */
+static inline void
+emitter_table_kv_note(emitter_t *emitter, const char *table_key,
+    emitter_type_t value_type, const void *value, const char *table_note_key,
+    emitter_type_t table_note_value_type, const void *table_note_value) {
+	if (emitter->output != emitter_output_table) {
+		/* Nothing to print, but the flag is still updated. */
+		emitter->item_at_depth = true;
+		return;
+	}
+
+	emitter_indent(emitter);
+	emitter_printf(emitter, "%s: ", table_key);
+	emitter_print_value(
+	    emitter, emitter_justify_none, -1, value_type, value);
+
+	bool has_note = (table_note_key != NULL);
+	if (has_note) {
+		emitter_printf(emitter, " (%s: ", table_note_key);
+		emitter_print_value(emitter, emitter_justify_none, -1,
+		    table_note_value_type, table_note_value);
+		emitter_printf(emitter, ")");
+	}
+	emitter_printf(emitter, "\n");
+
+	emitter->item_at_depth = true;
+}
+
+/*
+ * emitter_table_kv_note without a note.  The NULL note key suppresses the
+ * note, so the note type and value passed along are never read.
+ */
+static inline void
+emitter_table_kv(emitter_t *emitter, const char *table_key,
+    emitter_type_t value_type, const void *value) {
+	emitter_table_kv_note(emitter, table_key, value_type, value, NULL,
+	    emitter_type_bool, NULL);
+}
+
+/*
+ * printf-style output that is produced in table mode only; in the JSON modes
+ * the call does nothing.
+ */
+JEMALLOC_FORMAT_PRINTF(2, 3)
+static inline void
+emitter_table_printf(emitter_t *emitter, const char *format, ...) {
+	if (emitter->output != emitter_output_table) {
+		return;
+	}
+	va_list args;
+	va_start(args, format);
+	malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, args);
+	va_end(args);
+}
+
+/*
+ * Table mode only: prints every column of row, in list order, using each
+ * column's own justification, width and type, then ends the line.
+ */
+static inline void
+emitter_table_row(emitter_t *emitter, emitter_row_t *row) {
+	if (emitter->output != emitter_output_table) {
+		return;
+	}
+	emitter_col_t *col;
+	ql_foreach (col, &row->cols, link) {
+		/*
+		 * bool_val is the first member of the value union, so its
+		 * address is where the value lives whatever col->type is.
+		 */
+		emitter_print_value(emitter, col->justify, col->width,
+		    col->type, (const void *)&col->bool_val);
+	}
+	emitter_table_printf(emitter, "\n");
+}
+
+/* Initializes row with an empty column list. */
+static inline void
+emitter_row_init(emitter_row_t *row) {
+	ql_new(&row->cols);
+}
+
+/*
+ * Initializes col's list linkage and appends col at the tail of row's column
+ * list.  The user-filled fields of col are not touched.
+ */
+static inline void
+emitter_col_init(emitter_col_t *col, emitter_row_t *row) {
+	ql_elm_new(col, link);
+	ql_tail_insert(&row->cols, col, link);
+}
+
+/******************************************************************************/
+/*
+ * Generalized public API.  Emits using either JSON or table, according to
+ * settings in the emitter_t.
+ */
+
+/*
+ * Emits a key/value pair in whichever mode the emitter is in.  The note is a
+ * second key/value pair that is emitted in table mode only, and only if
+ * table_note_key is non-NULL.
+ */
+static inline void
+emitter_kv_note(emitter_t *emitter, const char *json_key, const char *table_key,
+    emitter_type_t value_type, const void *value, const char *table_note_key,
+    emitter_type_t table_note_value_type, const void *table_note_value) {
+	if (!emitter_outputs_json(emitter)) {
+		emitter_table_kv_note(emitter, table_key, value_type, value,
+		    table_note_key, table_note_value_type, table_note_value);
+	} else {
+		/* Key followed by value. */
+		emitter_json_kv(emitter, json_key, value_type, value);
+	}
+	emitter->item_at_depth = true;
+}
+
+/*
+ * emitter_kv_note without a note.  The NULL note key suppresses the note, so
+ * the note type and value passed along are never read.
+ */
+static inline void
+emitter_kv(emitter_t *emitter, const char *json_key, const char *table_key,
+    emitter_type_t value_type, const void *value) {
+	emitter_kv_note(emitter, json_key, table_key, value_type, value, NULL,
+	    emitter_type_bool, NULL);
+}
+
+/*
+ * Opens a dictionary: in the JSON modes an object stored under json_key, in
+ * table mode a section headed by table_header.
+ */
+static inline void
+emitter_dict_begin(
+    emitter_t *emitter, const char *json_key, const char *table_header) {
+	if (!emitter_outputs_json(emitter)) {
+		emitter_table_dict_begin(emitter, table_header);
+		return;
+	}
+	/* Key, then the opening of the object that is its value. */
+	emitter_json_object_kv_begin(emitter, json_key);
+}
+
+/* Closes the dictionary opened by the matching emitter_dict_begin. */
+static inline void
+emitter_dict_end(emitter_t *emitter) {
+	if (!emitter_outputs_json(emitter)) {
+		emitter_table_dict_end(emitter);
+		return;
+	}
+	emitter_json_object_end(emitter);
+}
+
+/*
+ * Starts the output.  In the JSON modes this opens the top-level object; in
+ * table mode it only writes an empty string.
+ */
+static inline void
+emitter_begin(emitter_t *emitter) {
+	if (!emitter_outputs_json(emitter)) {
+		/*
+		 * This guarantees that we always call write_cb at least once.
+		 * This is useful if some invariant is established by each call
+		 * to write_cb, but doesn't hold initially: e.g., some buffer
+		 * holds a null-terminated string.
+		 */
+		emitter_printf(emitter, "%s", "");
+		return;
+	}
+	assert(emitter->nesting_depth == 0);
+	emitter_printf(emitter, "{");
+	emitter_nest_inc(emitter);
+}
+
+/*
+ * Finishes the output.  In the JSON modes this closes the top-level object
+ * opened by emitter_begin; in table mode there is nothing to do.
+ */
+static inline void
+emitter_end(emitter_t *emitter) {
+	if (!emitter_outputs_json(emitter)) {
+		return;
+	}
+	assert(emitter->nesting_depth == 1);
+	emitter_nest_dec(emitter);
+
+	const char *closing;
+	if (emitter->output == emitter_output_json_compact) {
+		closing = "}";
+	} else {
+		closing = "\n}\n";
+	}
+	emitter_printf(emitter, "%s", closing);
+}
+
+#endif /* JEMALLOC_INTERNAL_EMITTER_H */
--- a/include/jemalloc/internal/eset.h
+++ b/include/jemalloc/internal/eset.h
@ -0,0 +1,78 @@
+#ifndef JEMALLOC_INTERNAL_ESET_H
+#define JEMALLOC_INTERNAL_ESET_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/edata.h"
+#include "jemalloc/internal/fb.h"
+#include "jemalloc/internal/mutex.h"
+
+/*
+ * An eset ("extent set") is a quantized collection of extents, with built-in
+ * LRU queue.
+ *
+ * This class is not thread-safe; synchronization must be done externally if
+ * there are mutating operations.  One exception is the stats counters, which
+ * may be read without any locking.
+ */
+
+/* One bin of an eset: a heap of extents plus a summary of the heap's min. */
+typedef struct eset_bin_s eset_bin_t;
+struct eset_bin_s {
+	edata_heap_t heap;
+	/*
+	 * We do first-fit across multiple size classes.  If we compared against
+	 * the min element in each heap directly, we'd take a cache miss per
+	 * extent we looked at.  If we co-locate the edata summaries, we only
+	 * take a miss on the edata we're actually going to return (which is
+	 * inevitable anyways).
+	 */
+	edata_cmp_summary_t heap_min;
+};
+
+/*
+ * Per-bin stats counters; reported per page size index by eset_nextents_get
+ * and eset_nbytes_get.
+ */
+typedef struct eset_bin_stats_s eset_bin_stats_t;
+struct eset_bin_stats_s {
+	/* Number of extents. */
+	atomic_zu_t nextents;
+	/* Sum total bytes of those extents. */
+	atomic_zu_t nbytes;
+};
+
+/* The extent set itself; see the description at the top of this file. */
+typedef struct eset_s eset_t;
+struct eset_s {
+	/* Bitmap for which set bits correspond to non-empty heaps. */
+	fb_group_t bitmap[FB_NGROUPS(SC_NPSIZES + 1)];
+
+	/* Quantized per size class heaps of extents. */
+	eset_bin_t bins[SC_NPSIZES + 1];
+
+	/* Stats counters, one entry per bin. */
+	eset_bin_stats_t bin_stats[SC_NPSIZES + 1];
+
+	/* LRU of all extents in heaps. */
+	edata_list_inactive_t lru;
+
+	/* Page sum for all extents in heaps. */
+	atomic_zu_t npages;
+
+	/*
+	 * A duplication of the data in the containing ecache.  We use this only
+	 * for assertions on the states of the passed-in extents.
+	 */
+	extent_state_t state;
+};
+
+void eset_init(eset_t *eset, extent_state_t state);
+
+size_t eset_npages_get(eset_t *eset);
+/* Get the number of extents in the given page size index. */
+size_t eset_nextents_get(eset_t *eset, pszind_t ind);
+/* Get the sum total bytes of the extents in the given page size index. */
+size_t eset_nbytes_get(eset_t *eset, pszind_t ind);
+
+void eset_insert(eset_t *eset, edata_t *edata);
+void eset_remove(eset_t *eset, edata_t *edata);
+/*
+ * Select an extent from this eset of the given size and alignment.  Returns
+ * null if no such item could be found.
+ */
+edata_t *eset_fit(eset_t *eset, size_t esize, size_t alignment, bool exact_only,
+    unsigned lg_max_fit);
+
+#endif /* JEMALLOC_INTERNAL_ESET_H */
--- a/include/jemalloc/internal/exp_grow.h
+++ b/include/jemalloc/internal/exp_grow.h
@ -0,0 +1,50 @@
+#ifndef JEMALLOC_INTERNAL_EXP_GROW_H
+#define JEMALLOC_INTERNAL_EXP_GROW_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/sz.h"
+/* State of an exponentially growing series of extent size classes. */
+typedef struct exp_grow_s exp_grow_t;
+struct exp_grow_s {
+	/*
+	 * Next extent size class in a growing series to use when satisfying a
+	 * request via the extent hooks (only if opt_retain).  This limits the
+	 * number of disjoint virtual memory ranges so that extent merging can
+	 * be effective even if multiple arenas' extent allocation requests are
+	 * highly interleaved.
+	 *
+	 * retain_grow_limit is the max allowed size ind to expand (unless the
+	 * required size is greater).  Default is no limit, and controlled
+	 * through mallctl only.
+	 */
+	/* Page size index to use next. */
+	pszind_t next;
+	/* Upper bound that exp_grow_size_commit applies to next. */
+	pszind_t limit;
+};
+
+/*
+ * Finds the first size in the series, starting at exp_grow->next, that is at
+ * least alloc_size_min.  On success returns false, with *r_alloc_size set to
+ * that size and *r_skip to the number of size classes skipped to reach it.
+ * Returns true if the search would leave the legal range.  exp_grow itself is
+ * not modified; see exp_grow_size_commit.
+ */
+static inline bool
+exp_grow_size_prepare(exp_grow_t *exp_grow, size_t alloc_size_min,
+    size_t *r_alloc_size, pszind_t *r_skip) {
+	*r_skip = 0;
+	*r_alloc_size = sz_pind2sz(exp_grow->next + *r_skip);
+	while (*r_alloc_size < alloc_size_min) {
+		(*r_skip)++;
+		if (exp_grow->next + *r_skip >= sz_psz2ind(SC_LARGE_MAXCLASS)) {
+			/* Outside legal range. */
+			return true;
+		}
+		*r_alloc_size = sz_pind2sz(exp_grow->next + *r_skip);
+	}
+	return false;
+}
+
+/*
+ * Advances the series past the size class that was just used (skip classes
+ * beyond next), without letting next exceed limit.
+ */
+static inline void
+exp_grow_size_commit(exp_grow_t *exp_grow, pszind_t skip) {
+	if (exp_grow->next + skip + 1 > exp_grow->limit) {
+		/* Clamp at the limit. */
+		exp_grow->next = exp_grow->limit;
+		return;
+	}
+	exp_grow->next += skip + 1;
+}
+
+void exp_grow_init(exp_grow_t *exp_grow);
+
+#endif /* JEMALLOC_INTERNAL_EXP_GROW_H */
--- a/include/jemalloc/internal/extent.h
+++ b/include/jemalloc/internal/extent.h
@ -1,239 +1,148 @@
-/******************************************************************************/
-#ifdef JEMALLOC_H_TYPES
+#ifndef JEMALLOC_INTERNAL_EXTENT_H
+#define JEMALLOC_INTERNAL_EXTENT_H

-typedef struct extent_node_s extent_node_t;
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/ecache.h"
+#include "jemalloc/internal/ehooks.h"
+#include "jemalloc/internal/pac.h"
+#include "jemalloc/internal/ph.h"
+#include "jemalloc/internal/rtree.h"

-#endif /* JEMALLOC_H_TYPES */
-/******************************************************************************/
-#ifdef JEMALLOC_H_STRUCTS
+/*
+ * This module contains the page-level allocator.  It chooses the addresses that
+ * allocations requested by other modules will inhabit, and updates the global
+ * metadata to reflect allocation/deallocation/purging decisions.
+ */

-/* Tree of extents.  Use accessor functions for en_* fields. */
-struct extent_node_s {
-	/* Arena from which this extent came, if any. */
-	arena_t			*en_arena;
+/*
+ * When reusing (and splitting) an active extent,
+ * (1U << opt_lg_extent_max_active_fit) is the max ratio between the size of
+ * the active extent and the new extent.
+ */
+#define LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT 6
+extern size_t opt_lg_extent_max_active_fit;

-	/* Pointer to the extent that this tree node is responsible for. */
-	void			*en_addr;
+#define PROCESS_MADVISE_MAX_BATCH_DEFAULT 0
+extern size_t opt_process_madvise_max_batch;

-	/* Total region size. */
-	size_t			en_size;
-
-	/*
-	 * The zeroed flag is used by chunk recycling code to track whether
-	 * memory is zero-filled.
-	 */
-	bool			en_zeroed;
-
-	/*
-	 * True if physical memory is committed to the extent, whether
-	 * explicitly or implicitly as on a system that overcommits and
-	 * satisfies physical memory needs on demand via soft page faults.
-	 */
-	bool			en_committed;
-
-	/*
-	 * The achunk flag is used to validate that huge allocation lookups
-	 * don't return arena chunks.
-	 */
-	bool			en_achunk;
-
-	/* Profile counters, used for huge objects. */
-	prof_tctx_t		*en_prof_tctx;
-
-	/* Linkage for arena's runs_dirty and chunks_cache rings. */
-	arena_runs_dirty_link_t	rd;
-	qr(extent_node_t)	cc_link;
-
-	union {
-		/* Linkage for the size/address-ordered tree. */
-		rb_node(extent_node_t)	szad_link;
-
-		/* Linkage for arena's achunks, huge, and node_cache lists. */
-		ql_elm(extent_node_t)	ql_link;
-	};
-
-	/* Linkage for the address-ordered tree. */
-	rb_node(extent_node_t)	ad_link;
-};
-typedef rb_tree(extent_node_t) extent_tree_t;
-
-#endif /* JEMALLOC_H_STRUCTS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_EXTERNS
-
-rb_proto(, extent_tree_szad_, extent_tree_t, extent_node_t)
-
-rb_proto(, extent_tree_ad_, extent_tree_t, extent_node_t)
-
-#endif /* JEMALLOC_H_EXTERNS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_INLINES
-
-#ifndef JEMALLOC_ENABLE_INLINE
-arena_t	*extent_node_arena_get(const extent_node_t *node);
-void	*extent_node_addr_get(const extent_node_t *node);
-size_t	extent_node_size_get(const extent_node_t *node);
-bool	extent_node_zeroed_get(const extent_node_t *node);
-bool	extent_node_committed_get(const extent_node_t *node);
-bool	extent_node_achunk_get(const extent_node_t *node);
-prof_tctx_t	*extent_node_prof_tctx_get(const extent_node_t *node);
-void	extent_node_arena_set(extent_node_t *node, arena_t *arena);
-void	extent_node_addr_set(extent_node_t *node, void *addr);
-void	extent_node_size_set(extent_node_t *node, size_t size);
-void	extent_node_zeroed_set(extent_node_t *node, bool zeroed);
-void	extent_node_committed_set(extent_node_t *node, bool committed);
-void	extent_node_achunk_set(extent_node_t *node, bool achunk);
-void	extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx);
-void	extent_node_init(extent_node_t *node, arena_t *arena, void *addr,
-    size_t size, bool zeroed, bool committed);
-void	extent_node_dirty_linkage_init(extent_node_t *node);
-void	extent_node_dirty_insert(extent_node_t *node,
-    arena_runs_dirty_link_t *runs_dirty, extent_node_t *chunks_dirty);
-void	extent_node_dirty_remove(extent_node_t *node);
+#ifdef JEMALLOC_HAVE_PROCESS_MADVISE
+/* The iovec is on stack.  Limit the max batch to avoid stack overflow. */
+#	define PROCESS_MADVISE_MAX_BATCH_LIMIT                                \
+		(VARIABLE_ARRAY_SIZE_MAX / sizeof(struct iovec))
+#else
+#	define PROCESS_MADVISE_MAX_BATCH_LIMIT 0
 #endif

-#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_EXTENT_C_))
-JEMALLOC_INLINE arena_t *
-extent_node_arena_get(const extent_node_t *node)
-{
+edata_t *ecache_alloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment,
+    bool zero, bool guarded);
+edata_t *ecache_alloc_grow(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment,
+    bool zero, bool guarded);
+void ecache_dalloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache,
+    edata_t *edata);
+edata_t *ecache_evict(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    ecache_t *ecache, size_t npages_min);

-	return (node->en_arena);
+void extent_gdump_add(tsdn_t *tsdn, const edata_t *edata);
+void extent_record(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache,
+    edata_t *edata);
+void extent_dalloc_gap(
+    tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata);
+edata_t *extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    void *new_addr, size_t size, size_t alignment, bool zero, bool *commit,
+    bool growing_retained);
+void     extent_dalloc_wrapper(
+        tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata);
+void extent_dalloc_wrapper_purged(
+    tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata);
+void extent_destroy_wrapper(
+    tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata);
+bool extent_purge_lazy_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
+    size_t offset, size_t length);
+bool extent_purge_forced_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
+    size_t offset, size_t length);
+edata_t *extent_split_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
+    edata_t *edata, size_t size_a, size_t size_b, bool holding_core_locks);
+bool     extent_merge_wrapper(
+        tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *a, edata_t *b);
+bool   extent_commit_zero(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
+      bool commit, bool zero, bool growing_retained);
+size_t extent_sn_next(pac_t *pac);
+bool   extent_boot(void);
+
+/*
+ * Head-state check for merging an extent with its neighbor: merging is
+ * disallowed if the higher-address extent is a head extent.  This helps
+ * preserve first-fit and, more importantly, ensures that no merge happens
+ * across arenas.  With forward set the neighbor is the higher-address
+ * extent; otherwise edata is.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+extent_neighbor_head_state_mergeable(
+    bool edata_is_head, bool neighbor_is_head, bool forward) {
+	bool higher_addr_is_head = forward ? neighbor_is_head : edata_is_head;
+	return !higher_addr_is_head;
 }

-JEMALLOC_INLINE void *
-extent_node_addr_get(const extent_node_t *node)
-{
+/*
+ * Decides whether the neighbor described by contents may be acquired for
+ * edata.  forward tells whether the neighbor is the higher-address extent.
+ * Returns false if:
+ *   - there is no neighbor (contents.edata is NULL);
+ *   - the head-state check fails;
+ *   - for EXTENT_PAI_PAC, the neighbor's state is not expected_state, or,
+ *     when not expanding, the committed states of the two extents differ;
+ *   - for any other pai, the neighbor is active;
+ *   - the neighbor's pai is not pai;
+ *   - opt_retain is off and the two extents are in different arenas.
+ * Otherwise returns true.  The neighbor is dereferenced only after the checks
+ * on contents.metadata have passed.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+extent_can_acquire_neighbor(edata_t *edata, rtree_contents_t contents,
+    extent_pai_t pai, extent_state_t expected_state, bool forward,
+    bool expanding) {
+	edata_t *neighbor = contents.edata;
+	if (neighbor == NULL) {
+		return false;
+	}
+	/* It's not safe to access *neighbor yet; must verify states first. */
+	bool neighbor_is_head = contents.metadata.is_head;
+	if (!extent_neighbor_head_state_mergeable(
+	        edata_is_head_get(edata), neighbor_is_head, forward)) {
+		return false;
+	}
+	extent_state_t neighbor_state = contents.metadata.state;
+	if (pai == EXTENT_PAI_PAC) {
+		if (neighbor_state != expected_state) {
+			return false;
+		}
+		/* From this point, it's safe to access *neighbor. */
+		if (!expanding
+		    && (edata_committed_get(edata)
+		        != edata_committed_get(neighbor))) {
+			/*
+			 * Some platforms (e.g. Windows) require an explicit
+			 * commit step (and writing to uncommitted memory is not
+			 * allowed).
+			 */
+			return false;
+		}
+	} else {
+		if (neighbor_state == extent_state_active) {
+			return false;
+		}
+		/* From this point, it's safe to access *neighbor. */
+	}

-	return (node->en_addr);
+	assert(edata_pai_get(edata) == pai);
+	if (edata_pai_get(neighbor) != pai) {
+		return false;
+	}
+	if (opt_retain) {
+		assert(edata_arena_ind_get(edata)
+		    == edata_arena_ind_get(neighbor));
+	} else {
+		if (edata_arena_ind_get(edata)
+		    != edata_arena_ind_get(neighbor)) {
+			return false;
+		}
+	}
+	assert(!edata_guarded_get(edata) && !edata_guarded_get(neighbor));
+
+	return true;
 }

-JEMALLOC_INLINE size_t
-extent_node_size_get(const extent_node_t *node)
-{
-
-	return (node->en_size);
-}
-
-JEMALLOC_INLINE bool
-extent_node_zeroed_get(const extent_node_t *node)
-{
-
-	return (node->en_zeroed);
-}
-
-JEMALLOC_INLINE bool
-extent_node_committed_get(const extent_node_t *node)
-{
-
-	assert(!node->en_achunk);
-	return (node->en_committed);
-}
-
-JEMALLOC_INLINE bool
-extent_node_achunk_get(const extent_node_t *node)
-{
-
-	return (node->en_achunk);
-}
-
-JEMALLOC_INLINE prof_tctx_t *
-extent_node_prof_tctx_get(const extent_node_t *node)
-{
-
-	return (node->en_prof_tctx);
-}
-
-JEMALLOC_INLINE void
-extent_node_arena_set(extent_node_t *node, arena_t *arena)
-{
-
-	node->en_arena = arena;
-}
-
-JEMALLOC_INLINE void
-extent_node_addr_set(extent_node_t *node, void *addr)
-{
-
-	node->en_addr = addr;
-}
-
-JEMALLOC_INLINE void
-extent_node_size_set(extent_node_t *node, size_t size)
-{
-
-	node->en_size = size;
-}
-
-JEMALLOC_INLINE void
-extent_node_zeroed_set(extent_node_t *node, bool zeroed)
-{
-
-	node->en_zeroed = zeroed;
-}
-
-JEMALLOC_INLINE void
-extent_node_committed_set(extent_node_t *node, bool committed)
-{
-
-	node->en_committed = committed;
-}
-
-JEMALLOC_INLINE void
-extent_node_achunk_set(extent_node_t *node, bool achunk)
-{
-
-	node->en_achunk = achunk;
-}
-
-JEMALLOC_INLINE void
-extent_node_prof_tctx_set(extent_node_t *node, prof_tctx_t *tctx)
-{
-
-	node->en_prof_tctx = tctx;
-}
-
-JEMALLOC_INLINE void
-extent_node_init(extent_node_t *node, arena_t *arena, void *addr, size_t size,
-    bool zeroed, bool committed)
-{
-
-	extent_node_arena_set(node, arena);
-	extent_node_addr_set(node, addr);
-	extent_node_size_set(node, size);
-	extent_node_zeroed_set(node, zeroed);
-	extent_node_committed_set(node, committed);
-	extent_node_achunk_set(node, false);
-	if (config_prof)
-		extent_node_prof_tctx_set(node, NULL);
-}
-
-JEMALLOC_INLINE void
-extent_node_dirty_linkage_init(extent_node_t *node)
-{
-
-	qr_new(&node->rd, rd_link);
-	qr_new(node, cc_link);
-}
-
-JEMALLOC_INLINE void
-extent_node_dirty_insert(extent_node_t *node,
-    arena_runs_dirty_link_t *runs_dirty, extent_node_t *chunks_dirty)
-{
-
-	qr_meld(runs_dirty, &node->rd, rd_link);
-	qr_meld(chunks_dirty, node, cc_link);
-}
-
-JEMALLOC_INLINE void
-extent_node_dirty_remove(extent_node_t *node)
-{
-
-	qr_remove(&node->rd, rd_link);
-	qr_remove(node, cc_link);
-}
-
-#endif
-
-#endif /* JEMALLOC_H_INLINES */
-/******************************************************************************/
-
+#endif /* JEMALLOC_INTERNAL_EXTENT_H */
--- a/include/jemalloc/internal/extent_dss.h
+++ b/include/jemalloc/internal/extent_dss.h
@ -0,0 +1,30 @@
+#ifndef JEMALLOC_INTERNAL_EXTENT_DSS_H
+#define JEMALLOC_INTERNAL_EXTENT_DSS_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/arena_types.h"
+#include "jemalloc/internal/tsd_types.h"
+
+/*
+ * DSS precedence setting (read and written through extent_dss_prec_get and
+ * extent_dss_prec_set).
+ * NOTE(review): dss_prec_limit looks like a one-past-the-end sentinel,
+ * presumably the length of dss_prec_names[] -- confirm against its definition.
+ */
+typedef enum {
+	dss_prec_disabled = 0,
+	dss_prec_primary = 1,
+	dss_prec_secondary = 2,
+
+	dss_prec_limit = 3
+} dss_prec_t;
+#define DSS_PREC_DEFAULT dss_prec_secondary
+#define DSS_DEFAULT "secondary"
+
+extern const char *const dss_prec_names[];
+
+extern const char *opt_dss;
+
+dss_prec_t extent_dss_prec_get(void);
+bool       extent_dss_prec_set(dss_prec_t dss_prec);
+void      *extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr,
+         size_t size, size_t alignment, bool *zero, bool *commit);
+bool       extent_in_dss(void *addr);
+bool       extent_dss_mergeable(void *addr_a, void *addr_b);
+void       extent_dss_boot(void);
+
+#endif /* JEMALLOC_INTERNAL_EXTENT_DSS_H */
--- a/include/jemalloc/internal/extent_mmap.h
+++ b/include/jemalloc/internal/extent_mmap.h
@ -0,0 +1,12 @@
+#ifndef JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H
+#define JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+
+extern bool opt_retain;
+
+void *extent_alloc_mmap(
+    void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit);
+bool extent_dalloc_mmap(void *addr, size_t size);
+
+#endif /* JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H */
--- a/include/jemalloc/internal/fb.h
+++ b/include/jemalloc/internal/fb.h
@ -0,0 +1,378 @@
+#ifndef JEMALLOC_INTERNAL_FB_H
+#define JEMALLOC_INTERNAL_FB_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/assert.h"
+#include "jemalloc/internal/bit_util.h"
+
+/*
+ * The flat bitmap module.  This has a larger API relative to the bitmap module
+ * (supporting things like backwards searches, and searching for both set and
+ * unset bits), at the cost of slower operations for very large bitmaps.
+ *
+ * Initialized flat bitmaps start at all-zeros (all bits unset).
+ */
+
+typedef unsigned long fb_group_t;
+#define FB_GROUP_BITS (ZU(1) << (LG_SIZEOF_LONG + 3))
+#define FB_NGROUPS(nbits)                                                      \
+	((nbits) / FB_GROUP_BITS + ((nbits) % FB_GROUP_BITS == 0 ? 0 : 1))
+
+static inline void
+fb_init(fb_group_t *fb, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	memset(fb, 0, ngroups * sizeof(fb_group_t));
+}
+
+static inline bool
+fb_empty(fb_group_t *fb, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	for (size_t i = 0; i < ngroups; i++) {
+		if (fb[i] != 0) {
+			return false;
+		}
+	}
+	return true;
+}
+
+static inline bool
+fb_full(fb_group_t *fb, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	size_t trailing_bits = nbits % FB_GROUP_BITS;
+	size_t limit = (trailing_bits == 0 ? ngroups : ngroups - 1);
+	for (size_t i = 0; i < limit; i++) {
+		if (fb[i] != ~(fb_group_t)0) {
+			return false;
+		}
+	}
+	if (trailing_bits == 0) {
+		return true;
+	}
+	return fb[ngroups - 1] == ((fb_group_t)1 << trailing_bits) - 1;
+}
+
+static inline bool
+fb_get(fb_group_t *fb, size_t nbits, size_t bit) {
+	assert(bit < nbits);
+	size_t group_ind = bit / FB_GROUP_BITS;
+	size_t bit_ind = bit % FB_GROUP_BITS;
+	return (bool)(fb[group_ind] & ((fb_group_t)1 << bit_ind));
+}
+
+static inline void
+fb_set(fb_group_t *fb, size_t nbits, size_t bit) {
+	assert(bit < nbits);
+	size_t group_ind = bit / FB_GROUP_BITS;
+	size_t bit_ind = bit % FB_GROUP_BITS;
+	fb[group_ind] |= ((fb_group_t)1 << bit_ind);
+}
+
+static inline void
+fb_unset(fb_group_t *fb, size_t nbits, size_t bit) {
+	assert(bit < nbits);
+	size_t group_ind = bit / FB_GROUP_BITS;
+	size_t bit_ind = bit % FB_GROUP_BITS;
+	fb[group_ind] &= ~((fb_group_t)1 << bit_ind);
+}
+
+/*
+ * Some implementation details.  This visitation function lets us apply a group
+ * visitor to each group in the bitmap (potentially modifying it).  The mask
+ * indicates which bits are logically part of the visitation.
+ */
+typedef void (*fb_group_visitor_t)(void *ctx, fb_group_t *fb, fb_group_t mask);
+JEMALLOC_ALWAYS_INLINE void
+fb_visit_impl(fb_group_t *fb, size_t nbits, fb_group_visitor_t visit, void *ctx,
+    size_t start, size_t cnt) {
+	assert(cnt > 0);
+	assert(start + cnt <= nbits);
+	size_t group_ind = start / FB_GROUP_BITS;
+	size_t start_bit_ind = start % FB_GROUP_BITS;
+	/*
+	 * The first group is special; it's the only one we don't start writing
+	 * to from bit 0.
+	 */
+	size_t first_group_cnt = (start_bit_ind + cnt > FB_GROUP_BITS
+	        ? FB_GROUP_BITS - start_bit_ind
+	        : cnt);
+	/*
+	 * We can basically split affected words into:
+	 *   - The first group, where we touch only the high bits
+	 *   - The last group, where we touch only the low bits
+	 *   - The middle, where we set all the bits to the same thing.
+	 * We treat each case individually.  The last two could be merged, but
+	 * this can lead to bad codegen for those middle words.
+	 */
+	/* First group */
+	fb_group_t mask =
+	    ((~(fb_group_t)0) >> (FB_GROUP_BITS - first_group_cnt))
+	    << start_bit_ind;
+	visit(ctx, &fb[group_ind], mask);
+
+	cnt -= first_group_cnt;
+	group_ind++;
+	/* Middle groups */
+	while (cnt > FB_GROUP_BITS) {
+		visit(ctx, &fb[group_ind], ~(fb_group_t)0);
+		cnt -= FB_GROUP_BITS;
+		group_ind++;
+	}
+	/* Last group */
+	if (cnt != 0) {
+		mask = (~(fb_group_t)0) >> (FB_GROUP_BITS - cnt);
+		visit(ctx, &fb[group_ind], mask);
+	}
+}
+
+JEMALLOC_ALWAYS_INLINE void
+fb_assign_visitor(void *ctx, fb_group_t *fb, fb_group_t mask) {
+	bool val = *(bool *)ctx;
+	if (val) {
+		*fb |= mask;
+	} else {
+		*fb &= ~mask;
+	}
+}
+
+/* Sets the cnt bits starting at position start.  Must not have a 0 count. */
+static inline void
+fb_set_range(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) {
+	bool val = true;
+	fb_visit_impl(fb, nbits, &fb_assign_visitor, &val, start, cnt);
+}
+
+/* Unsets the cnt bits starting at position start.  Must not have a 0 count. */
+static inline void
+fb_unset_range(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) {
+	bool val = false;
+	fb_visit_impl(fb, nbits, &fb_assign_visitor, &val, start, cnt);
+}
+
+JEMALLOC_ALWAYS_INLINE void
+fb_scount_visitor(void *ctx, fb_group_t *fb, fb_group_t mask) {
+	size_t *scount = (size_t *)ctx;
+	*scount += popcount_lu(*fb & mask);
+}
+
+/* Counts the set bits in the range of length cnt starting at start. */
+JEMALLOC_ALWAYS_INLINE size_t
+fb_scount(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) {
+	size_t scount = 0;
+	fb_visit_impl(fb, nbits, &fb_scount_visitor, &scount, start, cnt);
+	return scount;
+}
+
+/* Counts the unset bits in the range of length cnt starting at start. */
+JEMALLOC_ALWAYS_INLINE size_t
+fb_ucount(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) {
+	size_t scount = fb_scount(fb, nbits, start, cnt);
+	return cnt - scount;
+}
+
+/*
+ * An implementation detail; find the nearest bit with value val, searching
+ * forward (position >= start) or backward (position <= start) from start.
+ *
+ * Returns nbits if a forward search fails and -1 if a backward search fails.
+ */
+JEMALLOC_ALWAYS_INLINE ssize_t
+fb_find_impl(
+    fb_group_t *fb, size_t nbits, size_t start, bool val, bool forward) {
+	assert(start < nbits);
+	size_t  ngroups = FB_NGROUPS(nbits);
+	ssize_t group_ind = start / FB_GROUP_BITS;
+	size_t  bit_ind = start % FB_GROUP_BITS;
+
+	fb_group_t maybe_invert = (val ? 0 : (fb_group_t)-1);
+
+	fb_group_t group = fb[group_ind];
+	group ^= maybe_invert;
+	if (forward) {
+		/* Only keep ones in bits bit_ind and above. */
+		group &= ~((1LU << bit_ind) - 1);
+	} else {
+		/*
+		 * Only keep ones in bits bit_ind and below.  You might more
+		 * naturally express this as (1 << (bit_ind + 1)) - 1, but
+		 * that shifts by an invalid amount if bit_ind is one less than
+		 * FB_GROUP_BITS.
+		 */
+		group &= ((2LU << bit_ind) - 1);
+	}
+	ssize_t group_ind_bound = forward ? (ssize_t)ngroups : -1;
+	while (group == 0) {
+		group_ind += forward ? 1 : -1;
+		if (group_ind == group_ind_bound) {
+			return forward ? (ssize_t)nbits : (ssize_t)-1;
+		}
+		group = fb[group_ind];
+		group ^= maybe_invert;
+	}
+	assert(group != 0);
+	size_t bit = forward ? ffs_lu(group) : fls_lu(group);
+	size_t pos = group_ind * FB_GROUP_BITS + bit;
+	/*
+	 * The high bits of a partially filled last group are zeros, so if we're
+	 * looking for zeros we don't want to report an invalid result.
+	 */
+	if (forward && !val && pos > nbits) {
+		return nbits;
+	}
+	return pos;
+}
+
+/*
+ * Find the first unset bit in the bitmap with an index >= min_bit.  Returns
+ * the number of bits in the bitmap if no such bit exists.
+ */
+static inline size_t
+fb_ffu(fb_group_t *fb, size_t nbits, size_t min_bit) {
+	return (size_t)fb_find_impl(fb, nbits, min_bit, /* val */ false,
+	    /* forward */ true);
+}
+
+/* Like fb_ffu, but looks for the first set bit instead of an unset one. */
+static inline size_t
+fb_ffs(fb_group_t *fb, size_t nbits, size_t min_bit) {
+	return (size_t)fb_find_impl(fb, nbits, min_bit, /* val */ true,
+	    /* forward */ true);
+}
+
+/*
+ * Find the last unset bit in the bitmap with an index <= max_bit.  Returns -1
+ * if no such bit exists.
+ */
+static inline ssize_t
+fb_flu(fb_group_t *fb, size_t nbits, size_t max_bit) {
+	return fb_find_impl(fb, nbits, max_bit, /* val */ false,
+	    /* forward */ false);
+}
+
+static inline ssize_t
+fb_fls(fb_group_t *fb, size_t nbits, size_t max_bit) {
+	return fb_find_impl(fb, nbits, max_bit, /* val */ true,
+	    /* forward */ false);
+}
+
+/* Returns whether or not we found a range. */
+JEMALLOC_ALWAYS_INLINE bool
+fb_iter_range_impl(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin,
+    size_t *r_len, bool val, bool forward) {
+	assert(start < nbits);
+	ssize_t next_range_begin = fb_find_impl(fb, nbits, start, val, forward);
+	if ((forward && next_range_begin == (ssize_t)nbits)
+	    || (!forward && next_range_begin == (ssize_t)-1)) {
+		return false;
+	}
+	/* Half open range; the set bits are [begin, end). */
+	ssize_t next_range_end = fb_find_impl(
+	    fb, nbits, next_range_begin, !val, forward);
+	if (forward) {
+		*r_begin = next_range_begin;
+		*r_len = next_range_end - next_range_begin;
+	} else {
+		*r_begin = next_range_end + 1;
+		*r_len = next_range_begin - next_range_end;
+	}
+	return true;
+}
+
+/*
+ * Used to iterate through ranges of set bits.
+ *
+ * Tries to find the next contiguous sequence of set bits with a first index >=
+ * start.  If one exists, puts the earliest bit of the range in *r_begin, its
+ * length in *r_len, and returns true.  Otherwise, returns false (without
+ * touching *r_begin or *r_len).
+ */
+static inline bool
+fb_srange_iter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin,
+    size_t *r_len) {
+	return fb_iter_range_impl(fb, nbits, start, r_begin, r_len,
+	    /* val */ true, /* forward */ true);
+}
+
+/*
+ * The same as fb_srange_iter, but searches backwards from start rather than
+ * forwards.  (The position returned is still the earliest bit in the range).
+ */
+static inline bool
+fb_srange_riter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin,
+    size_t *r_len) {
+	return fb_iter_range_impl(fb, nbits, start, r_begin, r_len,
+	    /* val */ true, /* forward */ false);
+}
+
+/* Similar to fb_srange_iter, but searches for unset bits. */
+static inline bool
+fb_urange_iter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin,
+    size_t *r_len) {
+	return fb_iter_range_impl(fb, nbits, start, r_begin, r_len,
+	    /* val */ false, /* forward */ true);
+}
+
+/* Similar to fb_srange_riter, but searches for unset bits. */
+static inline bool
+fb_urange_riter(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin,
+    size_t *r_len) {
+	return fb_iter_range_impl(fb, nbits, start, r_begin, r_len,
+	    /* val */ false, /* forward */ false);
+}
+
+JEMALLOC_ALWAYS_INLINE size_t
+fb_range_longest_impl(fb_group_t *fb, size_t nbits, bool val) {
+	size_t begin = 0;
+	size_t longest_len = 0;
+	size_t len = 0;
+	while (begin < nbits
+	    && fb_iter_range_impl(
+	        fb, nbits, begin, &begin, &len, val, /* forward */ true)) {
+		if (len > longest_len) {
+			longest_len = len;
+		}
+		begin += len;
+	}
+	return longest_len;
+}
+
+static inline size_t
+fb_srange_longest(fb_group_t *fb, size_t nbits) {
+	return fb_range_longest_impl(fb, nbits, /* val */ true);
+}
+
+static inline size_t
+fb_urange_longest(fb_group_t *fb, size_t nbits) {
+	return fb_range_longest_impl(fb, nbits, /* val */ false);
+}
+
+/*
+ * Initializes each bit of dst with the bitwise-AND of the corresponding bits of
+ * src1 and src2.  All bitmaps must be the same size.
+ */
+static inline void
+fb_bit_and(fb_group_t *dst, fb_group_t *src1, fb_group_t *src2, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	for (size_t i = 0; i < ngroups; i++) {
+		dst[i] = src1[i] & src2[i];
+	}
+}
+
+/* Like fb_bit_and, but with bitwise-OR. */
+static inline void
+fb_bit_or(fb_group_t *dst, fb_group_t *src1, fb_group_t *src2, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	for (size_t i = 0; i < ngroups; i++) {
+		dst[i] = src1[i] | src2[i];
+	}
+}
+
+/* Initializes dst bit i to the negation of source bit i. */
+static inline void
+fb_bit_not(fb_group_t *dst, fb_group_t *src, size_t nbits) {
+	size_t ngroups = FB_NGROUPS(nbits);
+	for (size_t i = 0; i < ngroups; i++) {
+		dst[i] = ~src[i];
+	}
+}
+
+#endif /* JEMALLOC_INTERNAL_FB_H */
--- a/include/jemalloc/internal/fxp.h
+++ b/include/jemalloc/internal/fxp.h
@ -0,0 +1,129 @@
+#ifndef JEMALLOC_INTERNAL_FXP_H
+#define JEMALLOC_INTERNAL_FXP_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/assert.h"
+
+/*
+ * A simple fixed-point math implementation, supporting only unsigned values
+ * (with overflow being an error).
+ *
+ * It's not in general safe to use floating point in core code, because various
+ * libc implementations we get linked against can assume that malloc won't touch
+ * floating point state and call it with an unusual calling convention.
+ */
+
+/*
+ * High 16 bits are the integer part, low 16 are the fractional part.  Or
+ * equivalently, repr == 2**16 * val, where we use "val" to refer to the
+ * (imaginary) fractional representation of the true value.
+ *
+ * We pick a uint32_t here since it's convenient in some places to
+ * double the representation size (i.e. multiplication and division use
+ * 64-bit integer types), and a uint64_t is the largest type we're
+ * certain is available.
+ */
+typedef uint32_t fxp_t;
+#define FXP_INIT_INT(x) ((x) << 16)
+#define FXP_INIT_PERCENT(pct) (((pct) << 16) / 100)
+
+/*
+ * Amount of precision used in parsing and printing numbers.  The integer bound
+ * is simply because the integer part of the number gets 16 bits, and so is
+ * bounded by 65536.
+ *
+ * We use a lot of precision for the fractional part, even though most of it
+ * gets rounded off; this lets us get exact values for the important special
+ * case where the denominator is a small power of 2 (for instance,
+ * 1/512 == 0.001953125 is exactly representable even with only 16 bits of
+ * fractional precision).  We need to left-shift by 16 before dividing by
+ * 10**precision, so we pick precision to be floor(log10(2**48)) = 14.
+ */
+#define FXP_INTEGER_PART_DIGITS 5
+#define FXP_FRACTIONAL_PART_DIGITS 14
+
+/*
+ * In addition to the integer and fractional parts of the number, we need to
+ * include a null character and (possibly) a decimal point.
+ */
+#define FXP_BUF_SIZE (FXP_INTEGER_PART_DIGITS + FXP_FRACTIONAL_PART_DIGITS + 2)
+
+static inline fxp_t
+fxp_add(fxp_t a, fxp_t b) {
+	return a + b;
+}
+
+static inline fxp_t
+fxp_sub(fxp_t a, fxp_t b) {
+	assert(a >= b);
+	return a - b;
+}
+
+static inline fxp_t
+fxp_mul(fxp_t a, fxp_t b) {
+	uint64_t unshifted = (uint64_t)a * (uint64_t)b;
+	/*
+	 * Unshifted is (a.val * 2**16) * (b.val * 2**16)
+	 *   == (a.val * b.val) * 2**32, but we want
+	 * (a.val * b.val) * 2 ** 16.
+	 */
+	return (uint32_t)(unshifted >> 16);
+}
+
+static inline fxp_t
+fxp_div(fxp_t a, fxp_t b) {
+	assert(b != 0);
+	uint64_t unshifted = ((uint64_t)a << 32) / (uint64_t)b;
+	/*
+	 * Unshifted is (a.val * 2**16) * (2**32) / (b.val * 2**16)
+	 *   == (a.val / b.val) * (2 ** 32), which again corresponds to a right
+	 *   shift of 16.
+	 */
+	return (uint32_t)(unshifted >> 16);
+}
+
+static inline uint32_t
+fxp_round_down(fxp_t a) {
+	return a >> 16;
+}
+
+static inline uint32_t
+fxp_round_nearest(fxp_t a) {
+	uint32_t fractional_part = (a & ((1U << 16) - 1));
+	uint32_t increment = (uint32_t)(fractional_part >= (1U << 15));
+	return (a >> 16) + increment;
+}
+
+/*
+ * Approximately computes x * frac, without the size limitations that would be
+ * imposed by converting x_orig to an fxp_t.
+ */
+static inline size_t
+fxp_mul_frac(size_t x_orig, fxp_t frac) {
+	assert(frac <= (1U << 16));
+	/*
+	 * Work around an over-enthusiastic warning about type limits below (on
+	 * 32-bit platforms, a size_t is always less than 1ULL << 48).
+	 */
+	uint64_t x = (uint64_t)x_orig;
+	/*
+	 * If we can guarantee no overflow, multiply first before shifting, to
+	 * preserve some precision.  Otherwise, shift first and then multiply.
+	 * In the latter case, we only lose the low 16 bits of a 48-bit number,
+	 * so we're still accurate to within 1/2**32.
+	 */
+	if (x < (1ULL << 48)) {
+		return (size_t)((x * frac) >> 16);
+	} else {
+		return (size_t)((x >> 16) * (uint64_t)frac);
+	}
+}
+
+/*
+ * Returns true on error.  Otherwise, returns false and updates *end to point to
+ * the first character not parsed (because it wasn't a digit).
+ */
+bool fxp_parse(fxp_t *a, const char *ptr, char **end);
+void fxp_print(fxp_t a, char buf[FXP_BUF_SIZE]);
+
+#endif /* JEMALLOC_INTERNAL_FXP_H */
--- a/include/jemalloc/internal/hash.h
+++ b/include/jemalloc/internal/hash.h
@ -1,111 +1,79 @@
+#ifndef JEMALLOC_INTERNAL_HASH_H
+#define JEMALLOC_INTERNAL_HASH_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/assert.h"
+
 /*
 * The following hash function is based on MurmurHash3, placed into the public
 * domain by Austin Appleby.  See https://github.com/aappleby/smhasher for
 * details.
 */
-/******************************************************************************/
-#ifdef JEMALLOC_H_TYPES

-#endif /* JEMALLOC_H_TYPES */
-/******************************************************************************/
-#ifdef JEMALLOC_H_STRUCTS
-
-#endif /* JEMALLOC_H_STRUCTS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_EXTERNS
-
-#endif /* JEMALLOC_H_EXTERNS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_INLINES
-
-#ifndef JEMALLOC_ENABLE_INLINE
-uint32_t	hash_x86_32(const void *key, int len, uint32_t seed);
-void	hash_x86_128(const void *key, const int len, uint32_t seed,
-    uint64_t r_out[2]);
-void	hash_x64_128(const void *key, const int len, const uint32_t seed,
-    uint64_t r_out[2]);
-void	hash(const void *key, size_t len, const uint32_t seed,
-    size_t r_hash[2]);
-#endif
-
-#if (defined(JEMALLOC_ENABLE_INLINE) || defined(JEMALLOC_HASH_C_))
 /******************************************************************************/
 /* Internal implementation. */
-JEMALLOC_INLINE uint32_t
-hash_rotl_32(uint32_t x, int8_t r)
-{
-
+static inline uint32_t
+hash_rotl_32(uint32_t x, int8_t r) {
 	return ((x << r) | (x >> (32 - r)));
 }

-JEMALLOC_INLINE uint64_t
-hash_rotl_64(uint64_t x, int8_t r)
-{
-
+static inline uint64_t
+hash_rotl_64(uint64_t x, int8_t r) {
 	return ((x << r) | (x >> (64 - r)));
 }

-JEMALLOC_INLINE uint32_t
-hash_get_block_32(const uint32_t *p, int i)
-{
-
+static inline uint32_t
+hash_get_block_32(const uint32_t *p, int i) {
 	/* Handle unaligned read. */
-	if (unlikely((uintptr_t)p & (sizeof(uint32_t)-1)) != 0) {
+	if (unlikely((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0) {
 		uint32_t ret;

 		memcpy(&ret, (uint8_t *)(p + i), sizeof(uint32_t));
-		return (ret);
+		return ret;
 	}

-	return (p[i]);
+	return p[i];
 }

-JEMALLOC_INLINE uint64_t
-hash_get_block_64(const uint64_t *p, int i)
-{
-
+static inline uint64_t
+hash_get_block_64(const uint64_t *p, int i) {
 	/* Handle unaligned read. */
-	if (unlikely((uintptr_t)p & (sizeof(uint64_t)-1)) != 0) {
+	if (unlikely((uintptr_t)p & (sizeof(uint64_t) - 1)) != 0) {
 		uint64_t ret;

 		memcpy(&ret, (uint8_t *)(p + i), sizeof(uint64_t));
-		return (ret);
+		return ret;
 	}

-	return (p[i]);
+	return p[i];
 }

-JEMALLOC_INLINE uint32_t
-hash_fmix_32(uint32_t h)
-{
-
+static inline uint32_t
+hash_fmix_32(uint32_t h) {
 	h ^= h >> 16;
 	h *= 0x85ebca6b;
 	h ^= h >> 13;
 	h *= 0xc2b2ae35;
 	h ^= h >> 16;

-	return (h);
+	return h;
 }

-JEMALLOC_INLINE uint64_t
-hash_fmix_64(uint64_t k)
-{
-
+static inline uint64_t
+hash_fmix_64(uint64_t k) {
 	k ^= k >> 33;
 	k *= KQU(0xff51afd7ed558ccd);
 	k ^= k >> 33;
 	k *= KQU(0xc4ceb9fe1a85ec53);
 	k ^= k >> 33;

-	return (k);
+	return k;
 }

-JEMALLOC_INLINE uint32_t
-hash_x86_32(const void *key, int len, uint32_t seed)
-{
-	const uint8_t *data = (const uint8_t *) key;
-	const int nblocks = len / 4;
+static inline uint32_t
+hash_x86_32(const void *key, int len, uint32_t seed) {
+	const uint8_t *data = (const uint8_t *)key;
+	const int      nblocks = len / 4;

 	uint32_t h1 = seed;

@ -114,8 +82,8 @@ hash_x86_32(const void *key, int len, uint32_t seed)

 	/* body */
 	{
-		const uint32_t *blocks = (const uint32_t *) (data + nblocks*4);
-		int i;
+		const uint32_t *blocks = (const uint32_t *)(data + nblocks * 4);
+		int             i;

 		for (i = -nblocks; i; i++) {
 			uint32_t k1 = hash_get_block_32(blocks, i);
@ -126,21 +94,29 @@ hash_x86_32(const void *key, int len, uint32_t seed)

 			h1 ^= k1;
 			h1 = hash_rotl_32(h1, 13);
-			h1 = h1*5 + 0xe6546b64;
+			h1 = h1 * 5 + 0xe6546b64;
 		}
 	}

 	/* tail */
 	{
-		const uint8_t *tail = (const uint8_t *) (data + nblocks*4);
+		const uint8_t *tail = (const uint8_t *)(data + nblocks * 4);

 		uint32_t k1 = 0;

 		switch (len & 3) {
-		case 3: k1 ^= tail[2] << 16;
-		case 2: k1 ^= tail[1] << 8;
-		case 1: k1 ^= tail[0]; k1 *= c1; k1 = hash_rotl_32(k1, 15);
-			k1 *= c2; h1 ^= k1;
+		case 3:
+			k1 ^= tail[2] << 16;
+			JEMALLOC_FALLTHROUGH;
+		case 2:
+			k1 ^= tail[1] << 8;
+			JEMALLOC_FALLTHROUGH;
+		case 1:
+			k1 ^= tail[0];
+			k1 *= c1;
+			k1 = hash_rotl_32(k1, 15);
+			k1 *= c2;
+			h1 ^= k1;
 		}
 	}

@ -149,15 +125,13 @@ hash_x86_32(const void *key, int len, uint32_t seed)

 	h1 = hash_fmix_32(h1);

-	return (h1);
+	return h1;
 }

-UNUSED JEMALLOC_INLINE void
-hash_x86_128(const void *key, const int len, uint32_t seed,
-    uint64_t r_out[2])
-{
-	const uint8_t * data = (const uint8_t *) key;
-	const int nblocks = len / 16;
+static inline void
+hash_x86_128(const void *key, const int len, uint32_t seed, uint64_t r_out[2]) {
+	const uint8_t *data = (const uint8_t *)key;
+	const int      nblocks = len / 16;

 	uint32_t h1 = seed;
 	uint32_t h2 = seed;
@ -171,95 +145,161 @@ hash_x86_128(const void *key, const int len, uint32_t seed,

 	/* body */
 	{
-		const uint32_t *blocks = (const uint32_t *) (data + nblocks*16);
-		int i;
+		const uint32_t *blocks = (const uint32_t *)(data
+		    + nblocks * 16);
+		int             i;

 		for (i = -nblocks; i; i++) {
-			uint32_t k1 = hash_get_block_32(blocks, i*4 + 0);
-			uint32_t k2 = hash_get_block_32(blocks, i*4 + 1);
-			uint32_t k3 = hash_get_block_32(blocks, i*4 + 2);
-			uint32_t k4 = hash_get_block_32(blocks, i*4 + 3);
+			uint32_t k1 = hash_get_block_32(blocks, i * 4 + 0);
+			uint32_t k2 = hash_get_block_32(blocks, i * 4 + 1);
+			uint32_t k3 = hash_get_block_32(blocks, i * 4 + 2);
+			uint32_t k4 = hash_get_block_32(blocks, i * 4 + 3);

-			k1 *= c1; k1 = hash_rotl_32(k1, 15); k1 *= c2; h1 ^= k1;
+			k1 *= c1;
+			k1 = hash_rotl_32(k1, 15);
+			k1 *= c2;
+			h1 ^= k1;

-			h1 = hash_rotl_32(h1, 19); h1 += h2;
-			h1 = h1*5 + 0x561ccd1b;
+			h1 = hash_rotl_32(h1, 19);
+			h1 += h2;
+			h1 = h1 * 5 + 0x561ccd1b;

-			k2 *= c2; k2 = hash_rotl_32(k2, 16); k2 *= c3; h2 ^= k2;
+			k2 *= c2;
+			k2 = hash_rotl_32(k2, 16);
+			k2 *= c3;
+			h2 ^= k2;

-			h2 = hash_rotl_32(h2, 17); h2 += h3;
-			h2 = h2*5 + 0x0bcaa747;
+			h2 = hash_rotl_32(h2, 17);
+			h2 += h3;
+			h2 = h2 * 5 + 0x0bcaa747;

-			k3 *= c3; k3 = hash_rotl_32(k3, 17); k3 *= c4; h3 ^= k3;
+			k3 *= c3;
+			k3 = hash_rotl_32(k3, 17);
+			k3 *= c4;
+			h3 ^= k3;

-			h3 = hash_rotl_32(h3, 15); h3 += h4;
-			h3 = h3*5 + 0x96cd1c35;
+			h3 = hash_rotl_32(h3, 15);
+			h3 += h4;
+			h3 = h3 * 5 + 0x96cd1c35;

-			k4 *= c4; k4 = hash_rotl_32(k4, 18); k4 *= c1; h4 ^= k4;
+			k4 *= c4;
+			k4 = hash_rotl_32(k4, 18);
+			k4 *= c1;
+			h4 ^= k4;

-			h4 = hash_rotl_32(h4, 13); h4 += h1;
-			h4 = h4*5 + 0x32ac3b17;
+			h4 = hash_rotl_32(h4, 13);
+			h4 += h1;
+			h4 = h4 * 5 + 0x32ac3b17;
 		}
 	}

 	/* tail */
 	{
-		const uint8_t *tail = (const uint8_t *) (data + nblocks*16);
-		uint32_t k1 = 0;
-		uint32_t k2 = 0;
-		uint32_t k3 = 0;
-		uint32_t k4 = 0;
+		const uint8_t *tail = (const uint8_t *)(data + nblocks * 16);
+		uint32_t       k1 = 0;
+		uint32_t       k2 = 0;
+		uint32_t       k3 = 0;
+		uint32_t       k4 = 0;

 		switch (len & 15) {
-		case 15: k4 ^= tail[14] << 16;
-		case 14: k4 ^= tail[13] << 8;
-		case 13: k4 ^= tail[12] << 0;
-			k4 *= c4; k4 = hash_rotl_32(k4, 18); k4 *= c1; h4 ^= k4;
-
-		case 12: k3 ^= tail[11] << 24;
-		case 11: k3 ^= tail[10] << 16;
-		case 10: k3 ^= tail[ 9] << 8;
-		case  9: k3 ^= tail[ 8] << 0;
-		     k3 *= c3; k3 = hash_rotl_32(k3, 17); k3 *= c4; h3 ^= k3;
-
-		case  8: k2 ^= tail[ 7] << 24;
-		case  7: k2 ^= tail[ 6] << 16;
-		case  6: k2 ^= tail[ 5] << 8;
-		case  5: k2 ^= tail[ 4] << 0;
-			k2 *= c2; k2 = hash_rotl_32(k2, 16); k2 *= c3; h2 ^= k2;
-
-		case  4: k1 ^= tail[ 3] << 24;
-		case  3: k1 ^= tail[ 2] << 16;
-		case  2: k1 ^= tail[ 1] << 8;
-		case  1: k1 ^= tail[ 0] << 0;
-			k1 *= c1; k1 = hash_rotl_32(k1, 15); k1 *= c2; h1 ^= k1;
+		case 15:
+			k4 ^= tail[14] << 16;
+			JEMALLOC_FALLTHROUGH;
+		case 14:
+			k4 ^= tail[13] << 8;
+			JEMALLOC_FALLTHROUGH;
+		case 13:
+			k4 ^= tail[12] << 0;
+			k4 *= c4;
+			k4 = hash_rotl_32(k4, 18);
+			k4 *= c1;
+			h4 ^= k4;
+			JEMALLOC_FALLTHROUGH;
+		case 12:
+			k3 ^= (uint32_t)tail[11] << 24;
+			JEMALLOC_FALLTHROUGH;
+		case 11:
+			k3 ^= tail[10] << 16;
+			JEMALLOC_FALLTHROUGH;
+		case 10:
+			k3 ^= tail[9] << 8;
+			JEMALLOC_FALLTHROUGH;
+		case 9:
+			k3 ^= tail[8] << 0;
+			k3 *= c3;
+			k3 = hash_rotl_32(k3, 17);
+			k3 *= c4;
+			h3 ^= k3;
+			JEMALLOC_FALLTHROUGH;
+		case 8:
+			k2 ^= (uint32_t)tail[7] << 24;
+			JEMALLOC_FALLTHROUGH;
+		case 7:
+			k2 ^= tail[6] << 16;
+			JEMALLOC_FALLTHROUGH;
+		case 6:
+			k2 ^= tail[5] << 8;
+			JEMALLOC_FALLTHROUGH;
+		case 5:
+			k2 ^= tail[4] << 0;
+			k2 *= c2;
+			k2 = hash_rotl_32(k2, 16);
+			k2 *= c3;
+			h2 ^= k2;
+			JEMALLOC_FALLTHROUGH;
+		case 4:
+			k1 ^= (uint32_t)tail[3] << 24;
+			JEMALLOC_FALLTHROUGH;
+		case 3:
+			k1 ^= tail[2] << 16;
+			JEMALLOC_FALLTHROUGH;
+		case 2:
+			k1 ^= tail[1] << 8;
+			JEMALLOC_FALLTHROUGH;
+		case 1:
+			k1 ^= tail[0] << 0;
+			k1 *= c1;
+			k1 = hash_rotl_32(k1, 15);
+			k1 *= c2;
+			h1 ^= k1;
+			break;
 		}
 	}

 	/* finalization */
-	h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
+	h1 ^= len;
+	h2 ^= len;
+	h3 ^= len;
+	h4 ^= len;

-	h1 += h2; h1 += h3; h1 += h4;
-	h2 += h1; h3 += h1; h4 += h1;
+	h1 += h2;
+	h1 += h3;
+	h1 += h4;
+	h2 += h1;
+	h3 += h1;
+	h4 += h1;

 	h1 = hash_fmix_32(h1);
 	h2 = hash_fmix_32(h2);
 	h3 = hash_fmix_32(h3);
 	h4 = hash_fmix_32(h4);

-	h1 += h2; h1 += h3; h1 += h4;
-	h2 += h1; h3 += h1; h4 += h1;
+	h1 += h2;
+	h1 += h3;
+	h1 += h4;
+	h2 += h1;
+	h3 += h1;
+	h4 += h1;

-	r_out[0] = (((uint64_t) h2) << 32) | h1;
-	r_out[1] = (((uint64_t) h4) << 32) | h3;
+	r_out[0] = (((uint64_t)h2) << 32) | h1;
+	r_out[1] = (((uint64_t)h4) << 32) | h3;
 }

-UNUSED JEMALLOC_INLINE void
-hash_x64_128(const void *key, const int len, const uint32_t seed,
-    uint64_t r_out[2])
-{
-	const uint8_t *data = (const uint8_t *) key;
-	const int nblocks = len / 16;
+static inline void
+hash_x64_128(
+    const void *key, const int len, const uint32_t seed, uint64_t r_out[2]) {
+	const uint8_t *data = (const uint8_t *)key;
+	const int      nblocks = len / 16;

 	uint64_t h1 = seed;
 	uint64_t h2 = seed;
@ -269,55 +309,99 @@ hash_x64_128(const void *key, const int len, const uint32_t seed,

 	/* body */
 	{
-		const uint64_t *blocks = (const uint64_t *) (data);
-		int i;
+		const uint64_t *blocks = (const uint64_t *)(data);
+		int             i;

 		for (i = 0; i < nblocks; i++) {
-			uint64_t k1 = hash_get_block_64(blocks, i*2 + 0);
-			uint64_t k2 = hash_get_block_64(blocks, i*2 + 1);
+			uint64_t k1 = hash_get_block_64(blocks, i * 2 + 0);
+			uint64_t k2 = hash_get_block_64(blocks, i * 2 + 1);

-			k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1;
+			k1 *= c1;
+			k1 = hash_rotl_64(k1, 31);
+			k1 *= c2;
+			h1 ^= k1;

-			h1 = hash_rotl_64(h1, 27); h1 += h2;
-			h1 = h1*5 + 0x52dce729;
+			h1 = hash_rotl_64(h1, 27);
+			h1 += h2;
+			h1 = h1 * 5 + 0x52dce729;

-			k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2;
+			k2 *= c2;
+			k2 = hash_rotl_64(k2, 33);
+			k2 *= c1;
+			h2 ^= k2;

-			h2 = hash_rotl_64(h2, 31); h2 += h1;
-			h2 = h2*5 + 0x38495ab5;
+			h2 = hash_rotl_64(h2, 31);
+			h2 += h1;
+			h2 = h2 * 5 + 0x38495ab5;
 		}
 	}

 	/* tail */
 	{
-		const uint8_t *tail = (const uint8_t*)(data + nblocks*16);
-		uint64_t k1 = 0;
-		uint64_t k2 = 0;
+		const uint8_t *tail = (const uint8_t *)(data + nblocks * 16);
+		uint64_t       k1 = 0;
+		uint64_t       k2 = 0;

 		switch (len & 15) {
-		case 15: k2 ^= ((uint64_t)(tail[14])) << 48;
-		case 14: k2 ^= ((uint64_t)(tail[13])) << 40;
-		case 13: k2 ^= ((uint64_t)(tail[12])) << 32;
-		case 12: k2 ^= ((uint64_t)(tail[11])) << 24;
-		case 11: k2 ^= ((uint64_t)(tail[10])) << 16;
-		case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8;
-		case  9: k2 ^= ((uint64_t)(tail[ 8])) << 0;
-			k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2;
-
-		case  8: k1 ^= ((uint64_t)(tail[ 7])) << 56;
-		case  7: k1 ^= ((uint64_t)(tail[ 6])) << 48;
-		case  6: k1 ^= ((uint64_t)(tail[ 5])) << 40;
-		case  5: k1 ^= ((uint64_t)(tail[ 4])) << 32;
-		case  4: k1 ^= ((uint64_t)(tail[ 3])) << 24;
-		case  3: k1 ^= ((uint64_t)(tail[ 2])) << 16;
-		case  2: k1 ^= ((uint64_t)(tail[ 1])) << 8;
-		case  1: k1 ^= ((uint64_t)(tail[ 0])) << 0;
-			k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1;
+		case 15:
+			k2 ^= ((uint64_t)(tail[14])) << 48;
+			JEMALLOC_FALLTHROUGH;
+		case 14:
+			k2 ^= ((uint64_t)(tail[13])) << 40;
+			JEMALLOC_FALLTHROUGH;
+		case 13:
+			k2 ^= ((uint64_t)(tail[12])) << 32;
+			JEMALLOC_FALLTHROUGH;
+		case 12:
+			k2 ^= ((uint64_t)(tail[11])) << 24;
+			JEMALLOC_FALLTHROUGH;
+		case 11:
+			k2 ^= ((uint64_t)(tail[10])) << 16;
+			JEMALLOC_FALLTHROUGH;
+		case 10:
+			k2 ^= ((uint64_t)(tail[9])) << 8;
+			JEMALLOC_FALLTHROUGH;
+		case 9:
+			k2 ^= ((uint64_t)(tail[8])) << 0;
+			k2 *= c2;
+			k2 = hash_rotl_64(k2, 33);
+			k2 *= c1;
+			h2 ^= k2;
+			JEMALLOC_FALLTHROUGH;
+		case 8:
+			k1 ^= ((uint64_t)(tail[7])) << 56;
+			JEMALLOC_FALLTHROUGH;
+		case 7:
+			k1 ^= ((uint64_t)(tail[6])) << 48;
+			JEMALLOC_FALLTHROUGH;
+		case 6:
+			k1 ^= ((uint64_t)(tail[5])) << 40;
+			JEMALLOC_FALLTHROUGH;
+		case 5:
+			k1 ^= ((uint64_t)(tail[4])) << 32;
+			JEMALLOC_FALLTHROUGH;
+		case 4:
+			k1 ^= ((uint64_t)(tail[3])) << 24;
+			JEMALLOC_FALLTHROUGH;
+		case 3:
+			k1 ^= ((uint64_t)(tail[2])) << 16;
+			JEMALLOC_FALLTHROUGH;
+		case 2:
+			k1 ^= ((uint64_t)(tail[1])) << 8;
+			JEMALLOC_FALLTHROUGH;
+		case 1:
+			k1 ^= ((uint64_t)(tail[0])) << 0;
+			k1 *= c1;
+			k1 = hash_rotl_64(k1, 31);
+			k1 *= c2;
+			h1 ^= k1;
+			break;
 		}
 	}

 	/* finalization */
-	h1 ^= len; h2 ^= len;
+	h1 ^= len;
+	h2 ^= len;

 	h1 += h2;
 	h2 += h1;
@ -334,10 +418,8 @@ hash_x64_128(const void *key, const int len, const uint32_t seed,

 /******************************************************************************/
 /* API. */
-JEMALLOC_INLINE void
-hash(const void *key, size_t len, const uint32_t seed, size_t r_hash[2])
-{
-
+static inline void
+hash(const void *key, size_t len, const uint32_t seed, size_t r_hash[2]) {
 	assert(len <= INT_MAX); /* Unfortunate implementation limitation. */

 #if (LG_SIZEOF_PTR == 3 && !defined(JEMALLOC_BIG_ENDIAN))
@ -351,7 +433,5 @@ hash(const void *key, size_t len, const uint32_t seed, size_t r_hash[2])
 	}
 #endif
 }
-#endif

-#endif /* JEMALLOC_H_INLINES */
-/******************************************************************************/
+#endif /* JEMALLOC_INTERNAL_HASH_H */
--- a/include/jemalloc/internal/hook.h
+++ b/include/jemalloc/internal/hook.h
@ -0,0 +1,163 @@
+#ifndef JEMALLOC_INTERNAL_HOOK_H
+#define JEMALLOC_INTERNAL_HOOK_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/tsd.h"
+
+/*
+ * This API is *extremely* experimental, and may get ripped out, changed in API-
+ * and ABI-incompatible ways, be insufficiently or incorrectly documented, etc.
+ *
+ * It allows hooking the stateful parts of the API to see changes as they
+ * happen.
+ *
+ * Allocation hooks are called after the allocation is done, free hooks are
+ * called before the free is done, and expand hooks are called after the
+ * allocation is expanded.
+ *
+ * For realloc and rallocx, if the expansion happens in place, the expansion
+ * hook is called.  If it is moved, then the alloc hook is called on the new
+ * location, and then the free hook is called on the old location (i.e. both
+ * hooks are invoked in between the alloc and the dalloc).
+ *
+ * If we return NULL from OOM, then usize might not be trustworthy.  Calling
+ * realloc(NULL, size) only calls the alloc hook, and calling realloc(ptr, 0)
+ * only calls the free hook.  (Calling realloc(NULL, 0) is treated as malloc(0),
+ * and only calls the alloc hook).
+ *
+ * Reentrancy:
+ *   Reentrancy is guarded against from within the hook implementation.  If you
+ *   call allocator functions from within a hook, the hooks will not be invoked
+ *   again.
+ * Threading:
+ *   The installation of a hook synchronizes with all its uses.  If you can
+ *   prove the installation of a hook happens-before a jemalloc entry point,
+ *   then the hook will get invoked (unless there's a racing removal).
+ *
+ *   Hook insertion appears to be atomic at a per-thread level (i.e. if a thread
+ *   allocates and has the alloc hook invoked, then a subsequent free on the
+ *   same thread will also have the free hook invoked).
+ *
+ *   The *removal* of a hook does *not* block until all threads are done with
+ *   the hook.  Hook authors have to be resilient to this, and need some
+ *   out-of-band mechanism for cleaning up any dynamically allocated memory
+ *   associated with their hook.
+ * Ordering:
+ *   Order of hook execution is unspecified, and may be different than insertion
+ *   order.
+ */
+
+#define HOOK_MAX 4
+
+enum hook_alloc_e {
+	hook_alloc_malloc,
+	hook_alloc_posix_memalign,
+	hook_alloc_aligned_alloc,
+	hook_alloc_calloc,
+	hook_alloc_memalign,
+	hook_alloc_valloc,
+	hook_alloc_pvalloc,
+	hook_alloc_mallocx,
+
+	/* The reallocating functions have both alloc and dalloc variants */
+	hook_alloc_realloc,
+	hook_alloc_rallocx,
+};
+/*
+ * We put the enum typedef after the enum, since this file may get included by
+ * jemalloc_cpp.cpp, and C++ disallows enum forward declarations.
+ */
+typedef enum hook_alloc_e hook_alloc_t;
+
+enum hook_dalloc_e {
+	hook_dalloc_free,
+	hook_dalloc_dallocx,
+	hook_dalloc_sdallocx,
+
+	/*
+	 * The dalloc halves of reallocation (not called if in-place expansion
+	 * happens).
+	 */
+	hook_dalloc_realloc,
+	hook_dalloc_rallocx,
+};
+typedef enum hook_dalloc_e hook_dalloc_t;
+
+enum hook_expand_e {
+	hook_expand_realloc,
+	hook_expand_rallocx,
+	hook_expand_xallocx,
+};
+typedef enum hook_expand_e hook_expand_t;
+
+typedef void (*hook_alloc)(void *extra, hook_alloc_t type, void *result,
+    uintptr_t result_raw, uintptr_t args_raw[3]);
+
+typedef void (*hook_dalloc)(
+    void *extra, hook_dalloc_t type, void *address, uintptr_t args_raw[3]);
+
+typedef void (*hook_expand)(void *extra, hook_expand_t type, void *address,
+    size_t old_usize, size_t new_usize, uintptr_t result_raw,
+    uintptr_t args_raw[4]);
+
+typedef struct hooks_s hooks_t;
+struct hooks_s {
+	hook_alloc  alloc_hook;
+	hook_dalloc dalloc_hook;
+	hook_expand expand_hook;
+	void       *extra;
+};
+
+/*
+ * Begin implementation details; everything above this point might one day live
+ * in a public API.  Everything below this point never will.
+ */
+
+/*
+ * The realloc pathways haven't gotten any refactoring love in a while, and it's
+ * fairly difficult to pass information from the entry point to the hooks.  We
+ * put the information the hooks will need into a struct to encapsulate
+ * everything.
+ *
+ * Much of these pathways are force-inlined, so that the compiler can avoid
+ * materializing this struct until we hit an extern arena function.  For fairly
+ * goofy reasons, *many* of the realloc paths hit an extern arena function.
+ * These paths are cold enough that it doesn't matter; eventually, we should
+ * rewrite the realloc code to make the expand-in-place and the
+ * free-then-realloc paths more orthogonal, at which point we don't need to
+ * spread the hook logic all over the place.
+ */
+typedef struct hook_ralloc_args_s hook_ralloc_args_t;
+struct hook_ralloc_args_s {
+	/* I.e. as opposed to rallocx. */
+	bool is_realloc;
+	/*
+	 * The expand hook takes 4 arguments, even if only 3 are actually used;
+	 * we add an extra one in case the user decides to memcpy without
+	 * looking too closely at the hooked function.
+	 */
+	uintptr_t args[4];
+};
+
+bool hook_boot(void);
+
+/*
+ * Returns an opaque handle to be used when removing the hook.  NULL means that
+ * we couldn't install the hook.
+ */
+void *hook_install(tsdn_t *tsdn, hooks_t *to_install);
+/* Uninstalls the hook with the handle previously returned from hook_install. */
+void hook_remove(tsdn_t *tsdn, void *opaque);
+
+/* Hooks */
+
+void hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw,
+    uintptr_t args_raw[3]);
+
+void hook_invoke_dalloc(
+    hook_dalloc_t type, void *address, uintptr_t args_raw[3]);
+
+void hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize,
+    size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]);
+
+#endif /* JEMALLOC_INTERNAL_HOOK_H */
--- a/include/jemalloc/internal/hpa.h
+++ b/include/jemalloc/internal/hpa.h
@ -0,0 +1,185 @@
+#ifndef JEMALLOC_INTERNAL_HPA_H
+#define JEMALLOC_INTERNAL_HPA_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/base.h"
+#include "jemalloc/internal/edata_cache.h"
+#include "jemalloc/internal/emap.h"
+#include "jemalloc/internal/exp_grow.h"
+#include "jemalloc/internal/hpa_central.h"
+#include "jemalloc/internal/hpa_hooks.h"
+#include "jemalloc/internal/hpa_opts.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/pai.h"
+#include "jemalloc/internal/psset.h"
+#include "jemalloc/internal/sec.h"
+
+typedef struct hpa_shard_nonderived_stats_s hpa_shard_nonderived_stats_t;
+struct hpa_shard_nonderived_stats_s {
+	/*
+	 * The number of times we've purged within a hugepage.
+	 *
+	 * Guarded by mtx.
+	 */
+	uint64_t npurge_passes;
+	/*
+	 * The number of individual purge calls we perform (which should always
+	 * be at least npurge_passes, since each pass purges at least one
+	 * extent within a hugepage).
+	 *
+	 * Guarded by mtx.
+	 */
+	uint64_t npurges;
+
+	/*
+	 * The number of times we've hugified a pageslab.
+	 *
+	 * Guarded by mtx.
+	 */
+	uint64_t nhugifies;
+
+	/*
+	 * The number of times we've tried to hugify a pageslab, but failed.
+	 *
+	 * Guarded by mtx.
+	 */
+	uint64_t nhugify_failures;
+
+	/*
+	 * The number of times we've dehugified a pageslab.
+	 *
+	 * Guarded by mtx.
+	 */
+	uint64_t ndehugifies;
+};
+
+/* Completely derived; only used by CTL. */
+typedef struct hpa_shard_stats_s hpa_shard_stats_t;
+struct hpa_shard_stats_s {
+	psset_stats_t                psset_stats;
+	hpa_shard_nonderived_stats_t nonderived_stats;
+	sec_stats_t                  secstats;
+};
+
+typedef struct hpa_shard_s hpa_shard_t;
+struct hpa_shard_s {
+	/*
+	 * pai must be the first member; we cast from a pointer to it to a
+	 * pointer to the hpa_shard_t.
+	 */
+	pai_t pai;
+
+	/* The central allocator we get our hugepages from. */
+	hpa_central_t *central;
+
+	/* Protects most of this shard's state. */
+	malloc_mutex_t mtx;
+
+	/*
+	 * Guards the shard's access to the central allocator (preventing
+	 * multiple threads operating on this shard from accessing the central
+	 * allocator).
+	 */
+	malloc_mutex_t grow_mtx;
+
+	/* The base metadata allocator. */
+	base_t *base;
+
+	/*
+	 * This edata cache is the one we use when allocating a small extent
+	 * from a pageslab.  The pageslab itself comes from the centralized
+	 * allocator, and so will use its edata_cache.
+	 */
+	edata_cache_fast_t ecf;
+
+	/* Small extent cache (not guarded by mtx) */
+	JEMALLOC_ALIGNED(CACHELINE) sec_t sec;
+
+	psset_t psset;
+
+	/*
+	 * How many grow operations have occurred.
+	 *
+	 * Guarded by grow_mtx.
+	 */
+	uint64_t age_counter;
+
+	/* The arena ind we're associated with. */
+	unsigned ind;
+
+	/*
+	 * Our emap.  This is just a cache of the emap pointer in the associated
+	 * hpa_central.
+	 */
+	emap_t *emap;
+
+	/* The configuration choices for this hpa shard. */
+	hpa_shard_opts_t opts;
+
+	/*
+	 * How many pages have we started but not yet finished purging in this
+	 * hpa shard.
+	 */
+	size_t npending_purge;
+
+	/*
+	 * Those stats which are copied directly into the CTL-centric hpa shard
+	 * stats.
+	 */
+	hpa_shard_nonderived_stats_t stats;
+
+	/*
+	 * Last time we performed purge on this shard.
+	 */
+	nstime_t last_purge;
+
+	/*
+	 * Last time when we attempted work (purging or hugifying). If deferral
+	 * of the work is allowed (we have background thread), this is the time
+	 * when background thread checked if purging or hugifying needs to be
+	 * done. If deferral is not allowed, this is the time of (hpa_alloc or
+	 * hpa_dalloc) activity in the shard.
+	 */
+	nstime_t last_time_work_attempted;
+};
+
+bool hpa_hugepage_size_exceeds_limit(void);
+/*
+ * Whether or not the HPA can be used given the current configuration.  This is
+ * not necessarily a guarantee that it backs its allocations by hugepages,
+ * just that it can function properly given the system it's running on.
+ */
+bool hpa_supported(void);
+bool hpa_shard_init(tsdn_t *tsdn, hpa_shard_t *shard, hpa_central_t *central,
+    emap_t *emap, base_t *base, edata_cache_t *edata_cache, unsigned ind,
+    const hpa_shard_opts_t *opts, const sec_opts_t *sec_opts);
+
+void hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src);
+void hpa_shard_stats_merge(
+    tsdn_t *tsdn, hpa_shard_t *shard, hpa_shard_stats_t *dst);
+
+/*
+ * Notify the shard that we won't use it for allocations much longer.  Due to
+ * the possibility of races, we don't actually prevent allocations; just flush
+ * and disable the embedded edata_cache_fast.
+ */
+void hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard);
+void hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard);
+/* Flush caches that shard may be using */
+void hpa_shard_flush(tsdn_t *tsdn, hpa_shard_t *shard);
+
+void hpa_shard_set_deferral_allowed(
+    tsdn_t *tsdn, hpa_shard_t *shard, bool deferral_allowed);
+void hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard);
+
+/*
+ * We share the fork ordering with the PA and arena prefork handling; that's why
+ * these are 2, 3 and 4 rather than 0 and 1.
+ */
+void hpa_shard_prefork2(tsdn_t *tsdn, hpa_shard_t *shard);
+void hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard);
+void hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard);
+void hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard);
+void hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard);
+
+#endif /* JEMALLOC_INTERNAL_HPA_H */
--- a/include/jemalloc/internal/hpa_central.h
+++ b/include/jemalloc/internal/hpa_central.h
@ -0,0 +1,41 @@
+#ifndef JEMALLOC_INTERNAL_HPA_CENTRAL_H
+#define JEMALLOC_INTERNAL_HPA_CENTRAL_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/base.h"
+#include "jemalloc/internal/hpa_hooks.h"
+#include "jemalloc/internal/hpdata.h"
+#include "jemalloc/internal/mutex.h"
+#include "jemalloc/internal/tsd_types.h"
+
+typedef struct hpa_central_s hpa_central_t;
+struct hpa_central_s {
+	/*
+	 * Guards expansion of eden.  We separate this from the regular mutex so
+	 * that cheaper operations can still continue while we're doing the OS
+	 * call.
+	 */
+	malloc_mutex_t grow_mtx;
+	/*
+	 * Either NULL (if empty), or some integer multiple of a
+	 * hugepage-aligned number of hugepages.  We carve them off one at a
+	 * time to satisfy new pageslab requests.
+	 *
+	 * Guarded by grow_mtx.
+	 */
+	void  *eden;
+	size_t eden_len;
+	/* Source for metadata. */
+	base_t *base;
+
+	/* The HPA hooks. */
+	hpa_hooks_t hooks;
+};
+
+bool hpa_central_init(
+    hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks);
+
+hpdata_t *hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size,
+    uint64_t age, bool hugify_eager, bool *oom);
+
+#endif /* JEMALLOC_INTERNAL_HPA_CENTRAL_H */
--- a/include/jemalloc/internal/hpa_hooks.h
+++ b/include/jemalloc/internal/hpa_hooks.h
@ -0,0 +1,21 @@
+#ifndef JEMALLOC_INTERNAL_HPA_HOOKS_H
+#define JEMALLOC_INTERNAL_HPA_HOOKS_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/nstime.h"
+
+typedef struct hpa_hooks_s hpa_hooks_t;
+struct hpa_hooks_s {
+	void *(*map)(size_t size);
+	void (*unmap)(void *ptr, size_t size);
+	void (*purge)(void *ptr, size_t size);
+	bool (*hugify)(void *ptr, size_t size, bool sync);
+	void (*dehugify)(void *ptr, size_t size);
+	void (*curtime)(nstime_t *r_time, bool first_reading);
+	uint64_t (*ms_since)(nstime_t *r_time);
+	bool (*vectorized_purge)(void *vec, size_t vlen, size_t nbytes);
+};
+
+extern const hpa_hooks_t hpa_hooks_default;
+
+#endif /* JEMALLOC_INTERNAL_HPA_HOOKS_H */
--- a/include/jemalloc/internal/hpa_opts.h
+++ b/include/jemalloc/internal/hpa_opts.h
@ -0,0 +1,190 @@
+#ifndef JEMALLOC_INTERNAL_HPA_OPTS_H
+#define JEMALLOC_INTERNAL_HPA_OPTS_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/fxp.h"
+
+/*
+ * This file is morally part of hpa.h, but is split out for header-ordering
+ * reasons.
+ *
+ * All of these hpa_shard_opts below are experimental. We are exploring more
+ * efficient packing, hugifying, and purging approaches to make efficient
+ * trade-offs between CPU, memory, latency, and usability. This means all of
+ * them are at the risk of being deprecated and corresponding configurations
+ * should be updated once the final version settles.
+ */
+
+/*
+ * This enum controls how jemalloc hugifies/dehugifies pages.  Each style may be
+ * more suitable depending on deployment environments.
+ *
+ * hpa_hugify_style_none
+ * Using this means that jemalloc will not be hugifying or dehugifying pages,
+ * but will let the kernel make those decisions.  This style only makes sense
+ * when deploying on systems where THP are enabled in 'always' mode.  With this
+ * style, you most likely want to have no purging at all (dirty_mult=-1) or
+ * purge_threshold=HUGEPAGE bytes (2097152 for a 2MiB page), although other
+ * thresholds may work well depending on kernel settings of your deployment
+ * targets.
+ *
+ * hpa_hugify_style_eager
+ * This style results in jemalloc giving hugepage advice, if needed, to
+ * anonymous memory immediately after it is mapped, so huge pages can be backing
+ * that memory at page-fault time.  This is usually more efficient than doing
+ * it later, and it allows us to benefit from the hugepages from the start.
+ * Same options for purging as for the style 'none' are good starting choices:
+ * no purging, or purge_threshold=HUGEPAGE, some min_purge_delay_ms that allows
+ * for page not to be purged quickly, etc.  This is a good choice if you can
+ * afford extra memory and your application gets performance increase from
+ * transparent hugepages.
+ *
+ * hpa_hugify_style_lazy
+ * This style is suitable when you purge more aggressively (you sacrifice CPU
+ * performance for less memory).  When this style is chosen, jemalloc will
+ * hugify once hugification_threshold is reached, and dehugify before purging.
+ * If the kernel is configured to use direct compaction you may experience some
+ * allocation latency when using this style.  The best is to measure what works
+ * better for your application needs, and in the target deployment environment.
+ * This is a good choice for apps that cannot afford a lot of memory regression,
+ * but would still like to benefit from backing certain memory regions with
+ * hugepages.
+ */
+enum hpa_hugify_style_e {
+	hpa_hugify_style_auto = 0,
+	hpa_hugify_style_none = 1,
+	hpa_hugify_style_eager = 2,
+	hpa_hugify_style_lazy = 3,
+	hpa_hugify_style_limit = hpa_hugify_style_lazy + 1
+};
+typedef enum hpa_hugify_style_e hpa_hugify_style_t;
+
+extern const char *const hpa_hugify_style_names[];
+
+typedef struct hpa_shard_opts_s hpa_shard_opts_t;
+struct hpa_shard_opts_s {
+	/*
+	 * The largest size we'll allocate out of the shard.  For those
+	 * allocations refused, the caller (in practice, the PA module) will
+	 * fall back to the more general (for now) PAC, which can always handle
+	 * any allocation request.
+	 */
+	size_t slab_max_alloc;
+
+	/*
+	 * When the number of active bytes in a hugepage is >=
+	 * hugification_threshold, we force hugify it.
+	 */
+	size_t hugification_threshold;
+
+	/*
+	 * The HPA purges whenever the number of pages exceeds dirty_mult *
+	 * active_pages.  This may be set to (fxp_t)-1 to disable purging.
+	 */
+	fxp_t dirty_mult;
+
+	/*
+	 * Whether or not the PAI methods are allowed to defer work to a
+	 * subsequent hpa_shard_do_deferred_work() call.  Practically, this
+	 * corresponds to background threads being enabled.  We track this
+	 * ourselves for encapsulation purposes.
+	 */
+	bool deferral_allowed;
+
+	/*
+	 * How long a hugepage has to be a hugification candidate before it will
+	 * actually get hugified.
+	 */
+	uint64_t hugify_delay_ms;
+
+	/*
+	 * Hugify pages synchronously (hugify will happen even if hugify_style
+	 * is not hpa_hugify_style_lazy).
+	 */
+	bool hugify_sync;
+
+	/*
+	 * Minimum amount of time between purges.
+	 */
+	uint64_t min_purge_interval_ms;
+
+	/*
+	 * Maximum number of hugepages to purge on each purging attempt.
+	 */
+	ssize_t experimental_max_purge_nhp;
+
+	/*
+	 * Minimum number of inactive bytes needed for a non-empty page to be
+	 * considered purgable.
+	 *
+	 * When the number of touched inactive bytes on non-empty hugepage is
+	 * >= purge_threshold, the page is purgable.  Empty pages are always
+	 * purgable.  Setting this to HUGEPAGE bytes would only purge empty
+	 * pages if using hugify_style_eager and the purges would be exactly
+	 * HUGEPAGE bytes.  Depending on your kernel settings, this may result
+	 * in better performance.
+	 *
+	 * Please note, when threshold is reached, we will purge all the dirty
+	 * bytes, and not just up to the threshold.  If this is PAGE bytes, then
+	 * all the pages that have any dirty bytes are purgable.  We treat
+	 * purgability constraint for purge_threshold as stronger than
+	 * dirty_mult, IOW, if no page meets purge_threshold, we will not purge
+	 * even if we are above dirty_mult.
+	 */
+	size_t purge_threshold;
+
+	/*
+	 * Minimum number of ms that needs to elapse between HP page becoming
+	 * eligible for purging and actually getting purged.
+	 *
+	 * Setting this to a larger number would give better chance of reusing
+	 * that memory.  Setting it to 0 means that page is eligible for purging
+	 * as soon as it meets the purge_threshold.  The clock resets when
+	 * purgability of the page changes (page goes from being non-purgable to
+	 * purgable).  When using eager style you probably want to allow for
+	 * some delay, to avoid purging the page too quickly and give it time to
+	 * be used.
+	 */
+	uint64_t min_purge_delay_ms;
+
+	/*
+	 * Style of hugification/dehugification (see comment at
+	 * hpa_hugify_style_t for options).
+	 */
+	hpa_hugify_style_t hugify_style;
+};
+
+/* clang-format off */
+#define HPA_SHARD_OPTS_DEFAULT {					\
+	/* slab_max_alloc */						\
+	64 * 1024,							\
+	/* hugification_threshold */					\
+	HUGEPAGE * 95 / 100,						\
+	/* dirty_mult */						\
+	FXP_INIT_PERCENT(25),						\
+	/*								\
+	 * deferral_allowed						\
+	 * 								\
+	 * Really, this is always set by the arena during creation	\
+	 * or by an hpa_shard_set_deferral_allowed call, so the value	\
+	 * we put here doesn't matter.					\
+	 */								\
+	false,								\
+	/* hugify_delay_ms */						\
+	10 * 1000,							\
+	/* hugify_sync */						\
+	false,								\
+	/* min_purge_interval_ms */					\
+	5 * 1000,							\
+	/* experimental_max_purge_nhp */				\
+	-1,      							\
+	/* size_t purge_threshold */					\
+	PAGE,								\
+	/* min_purge_delay_ms */             				\
+	0,  								\
+	/* hugify_style */                				\
+	hpa_hugify_style_lazy						\
+}
+/* clang-format on */
+
+#endif /* JEMALLOC_INTERNAL_HPA_OPTS_H */
--- a/include/jemalloc/internal/hpa_utils.h
+++ b/include/jemalloc/internal/hpa_utils.h
@ -0,0 +1,161 @@
+#ifndef JEMALLOC_INTERNAL_HPA_UTILS_H
+#define JEMALLOC_INTERNAL_HPA_UTILS_H
+
+#include "jemalloc/internal/hpa.h"
+#include "jemalloc/internal/extent.h"
+
+#define HPA_MIN_VAR_VEC_SIZE 8
+/*
+ * This is used for jemalloc internal tuning and may change in the future based
+ * on production traffic.
+ *
+ * This value protects two things:
+ *    1. Stack size
+ *    2. Number of huge pages that are being purged in a batch as we do not
+ *       allow allocations while making madvise syscall.
+ */
+#define HPA_PURGE_BATCH_MAX 16
+
+#ifdef JEMALLOC_HAVE_PROCESS_MADVISE
+typedef struct iovec hpa_io_vector_t;
+#else
+typedef struct {
+	void  *iov_base;
+	size_t iov_len;
+} hpa_io_vector_t;
+#endif
+
+static inline size_t
+hpa_process_madvise_max_iovec_len(void) {
+	assert(
+	    opt_process_madvise_max_batch <= PROCESS_MADVISE_MAX_BATCH_LIMIT);
+	return opt_process_madvise_max_batch == 0
+	    ? HPA_MIN_VAR_VEC_SIZE
+	    : opt_process_madvise_max_batch;
+}
+
+/* Actually invoke hooks.  If vectorized purge fails, use single purges. */
+static inline void
+hpa_try_vectorized_purge(
+    hpa_hooks_t *hooks, hpa_io_vector_t *vec, size_t vlen, size_t nbytes) {
+	bool success = opt_process_madvise_max_batch > 0
+	    && !hooks->vectorized_purge(vec, vlen, nbytes);
+	if (!success) {
+		/*
+		 * Re-purging is safe after a failure (potential perf penalty,
+		 * avoidable if the kernel reported exactly what failed).
+		 */
+		for (size_t i = 0; i < vlen; ++i) {
+			hooks->purge(vec[i].iov_base, vec[i].iov_len);
+		}
+	}
+}
+
+/*
+ * This structure accumulates the regions for process_madvise. It invokes the
+ * hook when batch limit is reached.
+ */
+typedef struct {
+	hpa_io_vector_t *vp;
+	size_t           cur;
+	size_t           total_bytes;
+	size_t           capacity;
+} hpa_range_accum_t;
+
+static inline void
+hpa_range_accum_init(hpa_range_accum_t *ra, hpa_io_vector_t *v, size_t sz) {
+	ra->vp = v;
+	ra->capacity = sz;
+	ra->total_bytes = 0;
+	ra->cur = 0;
+}
+
+static inline void
+hpa_range_accum_flush(hpa_range_accum_t *ra, hpa_hooks_t *hooks) {
+	assert(ra->total_bytes > 0 && ra->cur > 0);
+	hpa_try_vectorized_purge(hooks, ra->vp, ra->cur, ra->total_bytes);
+	ra->cur = 0;
+	ra->total_bytes = 0;
+}
+
+static inline void
+hpa_range_accum_add(
+    hpa_range_accum_t *ra, void *addr, size_t sz, hpa_hooks_t *hooks) {
+	assert(ra->cur < ra->capacity);
+
+	/* Fill the next free slot, then flush once the vector is full. */
+	hpa_io_vector_t *slot = &ra->vp[ra->cur++];
+	slot->iov_base = addr;
+	slot->iov_len = sz;
+	ra->total_bytes += sz;
+	if (ra->cur == ra->capacity) {
+		hpa_range_accum_flush(ra, hooks);
+	}
+}
+
+static inline void
+hpa_range_accum_finish(hpa_range_accum_t *ra, hpa_hooks_t *hooks) {
+	if (ra->cur > 0) {
+		hpa_range_accum_flush(ra, hooks);
+	}
+}
+
+/*
+ * For purging more than one page we use batch of these items
+ */
+typedef struct {
+	hpdata_purge_state_t state;
+	hpdata_t            *hp;
+	bool                 dehugify;
+} hpa_purge_item_t;
+
+typedef struct hpa_purge_batch_s hpa_purge_batch_t;
+struct hpa_purge_batch_s {
+	hpa_purge_item_t *items;
+	size_t            items_capacity;
+	/* Number of huge pages to purge in current batch */
+	size_t item_cnt;
+	/* Number of ranges to purge in current batch */
+	size_t nranges;
+	/* Total number of dirty pages in current batch */
+	size_t ndirty_in_batch;
+
+	/* Max number of huge pages to purge */
+	size_t max_hp;
+	/*
+	 * Once we are above this watermark we should not add more pages
+	 * to the same batch. This is because while we want to minimize
+	 * number of madvise calls we also do not want to be preventing
+	 * allocations from too many huge pages (which we have to do
+	 * while they are being purged)
+	 */
+	size_t range_watermark;
+
+	size_t npurged_hp_total;
+};
+
+static inline bool
+hpa_batch_full(hpa_purge_batch_t *b) {
+	/* nranges may overshoot range_watermark, hence >= rather than ==. */
+	return b->npurged_hp_total == b->max_hp
+	    || b->item_cnt == b->items_capacity
+	    || b->nranges >= b->range_watermark;
+}
+
+static inline void
+hpa_batch_pass_start(hpa_purge_batch_t *b) {
+	b->item_cnt = 0;
+	b->nranges = 0;
+	b->ndirty_in_batch = 0;
+}
+
+static inline bool
+hpa_batch_empty(hpa_purge_batch_t *b) {
+	return b->item_cnt == 0;
+}
+
+/* Purge pages in a batch using given hooks */
+void hpa_purge_batch(
+    hpa_hooks_t *hooks, hpa_purge_item_t *batch, size_t batch_sz);
+
+#endif /* JEMALLOC_INTERNAL_HPA_UTILS_H */
--- a/include/jemalloc/internal/hpdata.h
+++ b/include/jemalloc/internal/hpdata.h
@ -0,0 +1,486 @@
+#ifndef JEMALLOC_INTERNAL_HPDATA_H
+#define JEMALLOC_INTERNAL_HPDATA_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/fb.h"
+#include "jemalloc/internal/nstime.h"
+#include "jemalloc/internal/pages.h"
+#include "jemalloc/internal/ph.h"
+#include "jemalloc/internal/ql.h"
+#include "jemalloc/internal/typed_list.h"
+
+/*
+ * The metadata representation we use for extents in hugepages.  While the PAC
+ * uses the edata_t to represent both active and inactive extents, the HP only
+ * uses the edata_t for active ones; instead, inactive extent state is tracked
+ * within hpdata associated with the enclosing hugepage-sized, hugepage-aligned
+ * region of virtual address space.
+ *
+ * An hpdata need not be "truly" backed by a hugepage (which is not necessarily
+ * an observable property of any given region of address space).  It's just
+ * hugepage-sized and hugepage-aligned; it's *potentially* huge.
+ */
+
+/*
+ * The max enumeration num should not exceed 2^16 - 1, see comments in edata.h
+ * for ESET_ENUMERATE_MAX_NUM for more details.
+ */
+#define PSSET_ENUMERATE_MAX_NUM 32
+typedef struct hpdata_s hpdata_t;
+ph_structs(hpdata_age_heap, hpdata_t, PSSET_ENUMERATE_MAX_NUM);
+struct hpdata_s {
+	/*
+	 * We likewise follow the edata convention of mangling names and forcing
+	 * the use of accessors -- this lets us add some consistency checks on
+	 * access.
+	 */
+
+	/*
+	 * The address of the hugepage in question.  This can't be named h_addr,
+	 * since that conflicts with a macro defined in Windows headers.
+	 */
+	void *h_address;
+	/* Its age (measured in psset operations). */
+	uint64_t h_age;
+	/* Whether or not we think the hugepage is mapped that way by the OS. */
+	bool h_huge;
+
+	/*
+	 * For some properties, we keep parallel sets of bools; h_foo_allowed
+	 * and h_in_psset_foo_container.  This is a decoupling mechanism to
+	 * avoid bothering the hpa (which manages policies) from the psset
+	 * (which is the mechanism used to enforce those policies).  This allows
+	 * all the container management logic to live in one place, without the
+	 * HPA needing to know or care how that happens.
+	 */
+
+	/*
+	 * Whether or not the hpdata is allowed to be used to serve allocations,
+	 * and whether or not the psset is currently tracking it as such.
+	 */
+	bool h_alloc_allowed;
+	bool h_in_psset_alloc_container;
+
+	/*
+	 * The same, but with purging.  There's no corresponding
+	 * h_in_psset_purge_container, because the psset (currently) always
+	 * removes hpdatas from their containers during updates (to implement
+	 * LRU for purging).
+	 */
+	bool h_purge_allowed;
+
+	/* And with hugifying. */
+	bool h_hugify_allowed;
+	/* When we became a hugification candidate. */
+	nstime_t h_time_hugify_allowed;
+	bool     h_in_psset_hugify_container;
+
+	/* Whether or not a purge or hugify is currently happening. */
+	bool h_mid_purge;
+	bool h_mid_hugify;
+
+	/*
+	 * Whether or not the hpdata is being updated in the psset (i.e. if
+	 * there has been a psset_update_begin call issued without a matching
+	 * psset_update_end call).  Eventually this will expand to other types
+	 * of updates.
+	 */
+	bool h_updating;
+
+	/* Whether or not the hpdata is in a psset. */
+	bool h_in_psset;
+
+	union {
+		/* When nonempty (and also nonfull), used by the psset bins. */
+		hpdata_age_heap_link_t age_link;
+		/*
+		 * When empty (or not corresponding to any hugepage), list
+		 * linkage.
+		 */
+		ql_elm(hpdata_t) ql_link_empty;
+	};
+
+	/*
+	 * Linkage for the psset to track candidates for purging and hugifying.
+	 */
+	ql_elm(hpdata_t) ql_link_purge;
+	ql_elm(hpdata_t) ql_link_hugify;
+
+	/* The length of the largest contiguous sequence of inactive pages. */
+	size_t h_longest_free_range;
+
+	/* Number of active pages. */
+	size_t h_nactive;
+
+	/* A bitmap with bits set in the active pages. */
+	fb_group_t active_pages[FB_NGROUPS(HUGEPAGE_PAGES)];
+
+	/*
+	 * Number of dirty or active pages, and a bitmap tracking them.  One
+	 * way to think of this is as which pages are dirty from the OS's
+	 * perspective.
+	 */
+	size_t h_ntouched;
+
+	/* The touched pages (using the same definition as above). */
+	fb_group_t touched_pages[FB_NGROUPS(HUGEPAGE_PAGES)];
+
+	/* Time when this extent (hpdata) becomes eligible for purging */
+	nstime_t h_time_purge_allowed;
+
+	/* True if the extent was huge and empty last time when it was purged */
+	bool h_purged_when_empty_and_huge;
+};
+
+TYPED_LIST(hpdata_empty_list, hpdata_t, ql_link_empty)
+TYPED_LIST(hpdata_purge_list, hpdata_t, ql_link_purge)
+TYPED_LIST(hpdata_hugify_list, hpdata_t, ql_link_hugify)
+
+ph_proto(, hpdata_age_heap, hpdata_t);
+
+static inline void *
+hpdata_addr_get(const hpdata_t *hpdata) {
+	return hpdata->h_address;
+}
+
+static inline void
+hpdata_addr_set(hpdata_t *hpdata, void *addr) {
+	assert(HUGEPAGE_ADDR2BASE(addr) == addr);
+	hpdata->h_address = addr;
+}
+
+static inline uint64_t
+hpdata_age_get(const hpdata_t *hpdata) {
+	return hpdata->h_age;
+}
+
+static inline void
+hpdata_age_set(hpdata_t *hpdata, uint64_t age) {
+	hpdata->h_age = age;
+}
+
+static inline bool
+hpdata_huge_get(const hpdata_t *hpdata) {
+	return hpdata->h_huge;
+}
+
+static inline bool
+hpdata_alloc_allowed_get(const hpdata_t *hpdata) {
+	return hpdata->h_alloc_allowed;
+}
+
+static inline void
+hpdata_alloc_allowed_set(hpdata_t *hpdata, bool alloc_allowed) {
+	hpdata->h_alloc_allowed = alloc_allowed;
+}
+
+static inline bool
+hpdata_in_psset_alloc_container_get(const hpdata_t *hpdata) {
+	return hpdata->h_in_psset_alloc_container;
+}
+
+static inline void
+hpdata_in_psset_alloc_container_set(hpdata_t *hpdata, bool in_container) {
+	assert(in_container != hpdata->h_in_psset_alloc_container);
+	hpdata->h_in_psset_alloc_container = in_container;
+}
+
+static inline bool
+hpdata_purge_allowed_get(const hpdata_t *hpdata) {
+	return hpdata->h_purge_allowed;
+}
+
+static inline void
+hpdata_purge_allowed_set(hpdata_t *hpdata, bool purge_allowed) {
+	assert(purge_allowed == false || !hpdata->h_mid_purge);
+	hpdata->h_purge_allowed = purge_allowed;
+}
+
+static inline bool
+hpdata_hugify_allowed_get(const hpdata_t *hpdata) {
+	return hpdata->h_hugify_allowed;
+}
+
+static inline void
+hpdata_allow_hugify(hpdata_t *hpdata, nstime_t now) {
+	assert(!hpdata->h_mid_hugify);
+	hpdata->h_hugify_allowed = true;
+	hpdata->h_time_hugify_allowed = now;
+}
+
+static inline nstime_t
+hpdata_time_hugify_allowed(const hpdata_t *hpdata) {
+	return hpdata->h_time_hugify_allowed; /* Set by hpdata_allow_hugify. */
+}
+
+static inline void
+hpdata_disallow_hugify(hpdata_t *hpdata) {
+	hpdata->h_hugify_allowed = false;
+}
+
+static inline bool
+hpdata_in_psset_hugify_container_get(const hpdata_t *hpdata) {
+	return hpdata->h_in_psset_hugify_container;
+}
+
+static inline void
+hpdata_in_psset_hugify_container_set(hpdata_t *hpdata, bool in_container) {
+	assert(in_container != hpdata->h_in_psset_hugify_container);
+	hpdata->h_in_psset_hugify_container = in_container;
+}
+
+static inline bool
+hpdata_mid_purge_get(const hpdata_t *hpdata) {
+	return hpdata->h_mid_purge;
+}
+
+static inline void
+hpdata_mid_purge_set(hpdata_t *hpdata, bool mid_purge) {
+	assert(mid_purge != hpdata->h_mid_purge);
+	hpdata->h_mid_purge = mid_purge;
+}
+
+static inline bool
+hpdata_mid_hugify_get(const hpdata_t *hpdata) {
+	return hpdata->h_mid_hugify;
+}
+
+static inline void
+hpdata_mid_hugify_set(hpdata_t *hpdata, bool mid_hugify) {
+	assert(mid_hugify != hpdata->h_mid_hugify);
+	hpdata->h_mid_hugify = mid_hugify;
+}
+
+static inline bool
+hpdata_changing_state_get(const hpdata_t *hpdata) {
+	return hpdata->h_mid_purge || hpdata->h_mid_hugify;
+}
+
+static inline bool
+hpdata_updating_get(const hpdata_t *hpdata) {
+	return hpdata->h_updating;
+}
+
+static inline void
+hpdata_updating_set(hpdata_t *hpdata, bool updating) {
+	assert(updating != hpdata->h_updating);
+	hpdata->h_updating = updating;
+}
+
+static inline bool
+hpdata_in_psset_get(const hpdata_t *hpdata) {
+	return hpdata->h_in_psset;
+}
+
+static inline void
+hpdata_in_psset_set(hpdata_t *hpdata, bool in_psset) {
+	assert(in_psset != hpdata->h_in_psset);
+	hpdata->h_in_psset = in_psset;
+}
+
+static inline size_t
+hpdata_longest_free_range_get(const hpdata_t *hpdata) {
+	return hpdata->h_longest_free_range;
+}
+
+static inline void
+hpdata_longest_free_range_set(hpdata_t *hpdata, size_t longest_free_range) {
+	assert(longest_free_range <= HUGEPAGE_PAGES);
+	hpdata->h_longest_free_range = longest_free_range;
+}
+
+static inline size_t
+hpdata_nactive_get(const hpdata_t *hpdata) {
+	return hpdata->h_nactive;
+}
+
+static inline size_t
+hpdata_ntouched_get(const hpdata_t *hpdata) {
+	return hpdata->h_ntouched;
+}
+
+static inline size_t
+hpdata_ndirty_get(const hpdata_t *hpdata) {
+	return hpdata->h_ntouched - hpdata->h_nactive;
+}
+
+/*
+ * Number of pages in the hugepage not counted in h_ntouched, i.e.
+ * HUGEPAGE_PAGES - h_ntouched.  Read-only, so the hpdata is taken by const
+ * pointer.
+ */
+static inline size_t
+hpdata_nretained_get(const hpdata_t *hpdata) {
+	return HUGEPAGE_PAGES - hpdata->h_ntouched;
+}
+
+static inline void
+hpdata_time_purge_allowed_set(hpdata_t *hpdata, const nstime_t *v) {
+	nstime_copy(&hpdata->h_time_purge_allowed, v);
+}
+
+static inline const nstime_t *
+hpdata_time_purge_allowed_get(const hpdata_t *hpdata) {
+	return &hpdata->h_time_purge_allowed;
+}
+
+static inline bool
+hpdata_purged_when_empty_and_huge_get(const hpdata_t *hpdata) {
+	return hpdata->h_purged_when_empty_and_huge;
+}
+
+static inline void
+hpdata_purged_when_empty_and_huge_set(hpdata_t *hpdata, bool v) {
+	hpdata->h_purged_when_empty_and_huge = v;
+}
+
+static inline void
+hpdata_assert_empty(hpdata_t *hpdata) {
+	assert(fb_empty(hpdata->active_pages, HUGEPAGE_PAGES));
+	assert(hpdata->h_nactive == 0);
+}
+
+/*
+ * Only used in tests, and in hpdata_assert_consistent, below.  Verifies some
+ * consistency properties of the hpdata (e.g. that cached counts of page stats
+ * match computed ones).
+ */
+static inline bool
+hpdata_consistent(hpdata_t *hpdata) {
+	bool res = true;
+
+	const size_t active_urange_longest = fb_urange_longest(
+	    hpdata->active_pages, HUGEPAGE_PAGES);
+	const size_t longest_free_range = hpdata_longest_free_range_get(hpdata);
+	if (active_urange_longest != longest_free_range) {
+		malloc_printf(
+		    "<jemalloc>: active_fb_urange_longest=%zu != hpdata_longest_free_range=%zu\n",
+		    active_urange_longest, longest_free_range);
+		res = false;
+	}
+
+	const size_t active_scount = fb_scount(
+	    hpdata->active_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES);
+	if (active_scount != hpdata->h_nactive) {
+		malloc_printf(
+		    "<jemalloc>: active_fb_scount=%zu != hpdata_nactive=%zu\n",
+		    active_scount, hpdata->h_nactive);
+		res = false;
+	}
+
+	const size_t touched_scount = fb_scount(
+	    hpdata->touched_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES);
+	if (touched_scount != hpdata->h_ntouched) {
+		malloc_printf(
+		    "<jemalloc>: touched_fb_scount=%zu != hpdata_ntouched=%zu\n",
+		    touched_scount, hpdata->h_ntouched);
+		res = false;
+	}
+
+	if (hpdata->h_ntouched < hpdata->h_nactive) {
+		malloc_printf(
+		    "<jemalloc>: hpdata_ntouched=%zu < hpdata_nactive=%zu\n",
+		    hpdata->h_ntouched, hpdata->h_nactive);
+		res = false;
+	}
+
+	if (hpdata->h_huge && (hpdata->h_ntouched != HUGEPAGE_PAGES)) {
+		malloc_printf(
+		    "<jemalloc>: hpdata_huge=%d && (hpdata_ntouched=%zu != hugepage_pages=%zu)\n",
+		    hpdata->h_huge, hpdata->h_ntouched, HUGEPAGE_PAGES);
+		res = false;
+	}
+
+	const bool changing_state = hpdata_changing_state_get(hpdata);
+	if (changing_state
+	    && (hpdata->h_purge_allowed || hpdata->h_hugify_allowed)) {
+		malloc_printf(
+		    "<jemalloc>: hpdata_changing_state=%d && (hpdata_purge_allowed=%d || hpdata_hugify_allowed=%d)\n",
+		    changing_state, hpdata->h_purge_allowed,
+		    hpdata->h_hugify_allowed);
+		res = false;
+	}
+
+	if (hpdata_hugify_allowed_get(hpdata)
+	    != hpdata_in_psset_hugify_container_get(hpdata)) {
+		malloc_printf(
+		    "<jemalloc>: hpdata_hugify_allowed=%d != hpdata_in_psset_hugify_container=%d\n",
+		    hpdata_hugify_allowed_get(hpdata),
+		    hpdata_in_psset_hugify_container_get(hpdata));
+		res = false;
+	}
+
+	return res;
+}
+
+#define hpdata_assert_consistent(hpdata)                                       \
+	do {                                                                   \
+		assert(hpdata_consistent(hpdata));                             \
+	} while (0)
+
+static inline bool
+hpdata_empty(const hpdata_t *hpdata) {
+	return hpdata->h_nactive == 0;
+}
+
+static inline bool
+hpdata_full(const hpdata_t *hpdata) {
+	return hpdata->h_nactive == HUGEPAGE_PAGES;
+}
+
+void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age, bool is_huge);
+
+/*
+ * Given an hpdata which can serve an allocation request, pick and reserve an
+ * offset within that allocation.
+ */
+void *hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz);
+void  hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz);
+
+/*
+ * The hpdata_purge_state_t allows grabbing the metadata required to purge
+ * subranges of a hugepage while holding a lock, dropping the lock during the
+ * actual purging of them, and reacquiring it to update the metadata again.
+ */
+typedef struct hpdata_purge_state_s hpdata_purge_state_t;
+struct hpdata_purge_state_s {
+	size_t     npurged;
+	size_t     ndirty_to_purge;
+	fb_group_t to_purge[FB_NGROUPS(HUGEPAGE_PAGES)];
+	size_t     next_purge_search_begin;
+};
+
+/*
+ * Initializes purge state.  The access to hpdata must be externally
+ * synchronized with other hpdata_* calls.
+ *
+ * You can tell whether or not a thread is purging or hugifying a given hpdata
+ * via hpdata_changing_state_get(hpdata).  Racing hugification or purging
+ * operations aren't allowed.
+ *
+ * Once you begin purging, you have to follow through and call hpdata_purge_next
+ * until you're done, and then end.  Allocating out of an hpdata undergoing
+ * purging is not allowed.
+ *
+ * Returns the number of dirty pages that will be purged and sets *nranges
+ * to the number of ranges with dirty pages that will be purged.
+ */
+size_t hpdata_purge_begin(
+    hpdata_t *hpdata, hpdata_purge_state_t *purge_state, size_t *nranges);
+
+/*
+ * If there are more extents to purge, sets *r_purge_addr and *r_purge_size to
+ * the address and size of the next extent to purge, and returns true.
+ * Otherwise, returns false to indicate that we're done.
+ *
+ * This requires exclusive access to the purge state, but *not* to the hpdata.
+ * In particular, unreserve calls are allowed while purging (i.e. you can dalloc
+ * into one part of the hpdata while purging a different part).
+ */
+bool hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state,
+    void **r_purge_addr, size_t *r_purge_size);
+/*
+ * Updates the hpdata metadata after all purging is done.  Needs external
+ * synchronization.
+ */
+void hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state);
+
+void hpdata_hugify(hpdata_t *hpdata);
+void hpdata_dehugify(hpdata_t *hpdata);
+
+#endif /* JEMALLOC_INTERNAL_HPDATA_H */
--- a/include/jemalloc/internal/huge.h
+++ b/include/jemalloc/internal/huge.h
@ -1,35 +0,0 @@
-/******************************************************************************/
-#ifdef JEMALLOC_H_TYPES
-
-#endif /* JEMALLOC_H_TYPES */
-/******************************************************************************/
-#ifdef JEMALLOC_H_STRUCTS
-
-#endif /* JEMALLOC_H_STRUCTS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_EXTERNS
-
-void	*huge_malloc(tsdn_t *tsdn, arena_t *arena, size_t usize, bool zero);
-void	*huge_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize,
-    size_t alignment, bool zero);
-bool	huge_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize,
-    size_t usize_min, size_t usize_max, bool zero);
-void	*huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize,
-    size_t usize, size_t alignment, bool zero, tcache_t *tcache);
-#ifdef JEMALLOC_JET
-typedef void (huge_dalloc_junk_t)(void *, size_t);
-extern huge_dalloc_junk_t *huge_dalloc_junk;
-#endif
-void	huge_dalloc(tsdn_t *tsdn, void *ptr);
-arena_t	*huge_aalloc(const void *ptr);
-size_t	huge_salloc(tsdn_t *tsdn, const void *ptr);
-prof_tctx_t	*huge_prof_tctx_get(tsdn_t *tsdn, const void *ptr);
-void	huge_prof_tctx_set(tsdn_t *tsdn, const void *ptr, prof_tctx_t *tctx);
-void	huge_prof_tctx_reset(tsdn_t *tsdn, const void *ptr);
-
-#endif /* JEMALLOC_H_EXTERNS */
-/******************************************************************************/
-#ifdef JEMALLOC_H_INLINES
-
-#endif /* JEMALLOC_H_INLINES */
-/******************************************************************************/
--- a/include/jemalloc/internal/inspect.h
+++ b/include/jemalloc/internal/inspect.h
@ -0,0 +1,43 @@
+#ifndef JEMALLOC_INTERNAL_INSPECT_H
+#define JEMALLOC_INTERNAL_INSPECT_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/tsd_types.h"
+
+/*
+ * This module contains the heap introspection capabilities.  For now they are
+ * exposed purely through mallctl APIs in the experimental namespace, but this
+ * may change over time.
+ */
+
+/*
+ * The following two structs are for experimental purposes. See
+ * experimental_utilization_query_ctl and
+ * experimental_utilization_batch_query_ctl in src/ctl.c.
+ */
+typedef struct inspect_extent_util_stats_s inspect_extent_util_stats_t;
+struct inspect_extent_util_stats_s {
+	size_t nfree;
+	size_t nregs;
+	size_t size;
+};
+
+typedef struct inspect_extent_util_stats_verbose_s
+    inspect_extent_util_stats_verbose_t;
+
+struct inspect_extent_util_stats_verbose_s {
+	void  *slabcur_addr;
+	size_t nfree;
+	size_t nregs;
+	size_t size;
+	size_t bin_nfree;
+	size_t bin_nregs;
+};
+
+void inspect_extent_util_stats_get(
+    tsdn_t *tsdn, const void *ptr, size_t *nfree, size_t *nregs, size_t *size);
+void inspect_extent_util_stats_verbose_get(tsdn_t *tsdn, const void *ptr,
+    size_t *nfree, size_t *nregs, size_t *size, size_t *bin_nfree,
+    size_t *bin_nregs, void **slabcur_addr);
+
+#endif /* JEMALLOC_INTERNAL_INSPECT_H */
--- a/include/jemalloc/internal/jemalloc_internal.h.in
+++ b/include/jemalloc/internal/jemalloc_internal.h.in
--- a/include/jemalloc/internal/jemalloc_internal_decls.h
+++ b/include/jemalloc/internal/jemalloc_internal_decls.h
@ -1,40 +1,67 @@
 #ifndef JEMALLOC_INTERNAL_DECLS_H
-#define	JEMALLOC_INTERNAL_DECLS_H
+#define JEMALLOC_INTERNAL_DECLS_H

 #include <math.h>
 #ifdef _WIN32
-#  include <windows.h>
-#  include "msvc_compat/windows_extra.h"
-
+#	include <windows.h>
+#	include "msvc_compat/windows_extra.h"
+#	include "msvc_compat/strings.h"
+#	ifdef _WIN64
+#		if LG_VADDR <= 32
+#			error Generate the headers using x64 vcargs
+#		endif
+#	else
+#		if LG_VADDR > 32
+#			undef LG_VADDR
+#			define LG_VADDR 32
+#		endif
+#	endif
 #else
-#  include <sys/param.h>
-#  include <sys/mman.h>
-#  if !defined(__pnacl__) && !defined(__native_client__)
-#    include <sys/syscall.h>
-#    if !defined(SYS_write) && defined(__NR_write)
-#      define SYS_write __NR_write
-#    endif
-#    include <sys/uio.h>
-#  endif
-#  include <pthread.h>
-#  ifdef JEMALLOC_OS_UNFAIR_LOCK
-#    include <os/lock.h>
-#  endif
-#  ifdef JEMALLOC_GLIBC_MALLOC_HOOK
-#    include <sched.h>
-#  endif
-#  include <errno.h>
-#  include <sys/time.h>
-#  include <time.h>
-#  ifdef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME
-#    include <mach/mach_time.h>
-#  endif
+#	include <sys/param.h>
+#	include <sys/mman.h>
+#	if !defined(__pnacl__) && !defined(__native_client__)
+#		include <sys/syscall.h>
+#		if !defined(SYS_write) && defined(__NR_write)
+#			define SYS_write __NR_write
+#		endif
+#		if defined(SYS_open) && defined(__aarch64__)
+/* Android headers may define SYS_open to __NR_open even though
+        * __NR_open may not exist on AArch64 (superseded by __NR_openat). */
+#			undef SYS_open
+#		endif
+#		include <sys/uio.h>
+#	endif
+#	include <pthread.h>
+#	if defined(__FreeBSD__) || defined(__DragonFly__)                     \
+	    || defined(__OpenBSD__)
+#		include <pthread_np.h>
+#		include <sched.h>
+#		if defined(__FreeBSD__)
+#			define cpu_set_t cpuset_t
+#		endif
+#	endif
+#	include <signal.h>
+#	ifdef JEMALLOC_OS_UNFAIR_LOCK
+#		include <os/lock.h>
+#	endif
+#	ifdef JEMALLOC_GLIBC_MALLOC_HOOK
+#		include <sched.h>
+#	endif
+#	include <errno.h>
+#	include <sys/time.h>
+#	include <time.h>
+#	ifdef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME
+#		include <mach/mach_time.h>
+#	endif
 #endif
 #include <sys/types.h>

 #include <limits.h>
 #ifndef SIZE_T_MAX
-#  define SIZE_T_MAX	SIZE_MAX
+#	define SIZE_T_MAX SIZE_MAX
+#endif
+#ifndef SSIZE_MAX
+#	define SSIZE_MAX ((ssize_t)(SIZE_T_MAX >> 1))
 #endif
 #include <stdarg.h>
 #include <stdbool.h>
@ -43,33 +70,57 @@
 #include <stdint.h>
 #include <stddef.h>
 #ifndef offsetof
-#  define offsetof(type, member)	((size_t)&(((type *)NULL)->member))
+#	define offsetof(type, member) ((size_t) & (((type *)NULL)->member))
 #endif
 #include <string.h>
 #include <strings.h>
 #include <ctype.h>
 #ifdef _MSC_VER
-#  include <io.h>
+#	include <io.h>
 typedef intptr_t ssize_t;
-#  define PATH_MAX 1024
-#  define STDERR_FILENO 2
-#  define __func__ __FUNCTION__
-#  ifdef JEMALLOC_HAS_RESTRICT
-#    define restrict __restrict
-#  endif
+#	define PATH_MAX 1024
+#	define STDERR_FILENO 2
+#	define __func__ __FUNCTION__
+#	ifdef JEMALLOC_HAS_RESTRICT
+#		define restrict __restrict
+#	endif
 /* Disable warnings about deprecated system functions. */
-#  pragma warning(disable: 4996)
-#if _MSC_VER < 1800
+#	pragma warning(disable : 4996)
+#	if _MSC_VER < 1800
 static int
-isblank(int c)
-{
-
+isblank(int c) {
 	return (c == '\t' || c == ' ');
 }
-#endif
+#	endif
 #else
-#  include <unistd.h>
+#	include <unistd.h>
 #endif
 #include <fcntl.h>

+/*
+ * The Win32 midl compiler has #define small char; we don't use midl, but
+ * "small" is a nice identifier to have available when talking about size
+ * classes.
+ */
+#ifdef small
+#	undef small
+#endif
+
+/*
+ * Oftentimes we'd like to perform some kind of arithmetic to obtain
+ * a pointer from another pointer but with some offset or mask applied.
+ * Naively you would accomplish this by casting the source pointer to
+ * `uintptr_t`, performing all of the relevant arithmetic, and then casting
+ * the result to the desired pointer type. However, this has the unfortunate
+ * side-effect of concealing pointer provenance, hiding useful information for
+ * optimization from the compiler (see here for details:
+ * https://clang.llvm.org/extra/clang-tidy/checks/performance/no-int-to-ptr.html
+ * )
+ * Instead what one should do is cast the source pointer to `char *` and perform
+ * the equivalent arithmetic (since `char` of course represents one byte). But
+ * because `char *` has the semantic meaning of "string", we define this typedef
+ * simply to make it clearer where we are performing such pointer arithmetic.
+ */
+typedef char byte_t;
+
 #endif /* JEMALLOC_INTERNAL_H */
--- a/include/jemalloc/internal/jemalloc_internal_defs.h.in
+++ b/include/jemalloc/internal/jemalloc_internal_defs.h.in
@ -1,5 +1,5 @@
 #ifndef JEMALLOC_INTERNAL_DEFS_H_
-#define	JEMALLOC_INTERNAL_DEFS_H_
+#define JEMALLOC_INTERNAL_DEFS_H_
 /*
 * If JEMALLOC_PREFIX is defined via --with-jemalloc-prefix, it will cause all
 * public APIs to be prefixed.  This makes it possible, with some care, to use
@ -8,6 +8,21 @@
 #undef JEMALLOC_PREFIX
 #undef JEMALLOC_CPREFIX

+/*
+ * Define overrides for non-standard allocator-related functions if they are
+ * present on the system.
+ */
+#undef JEMALLOC_OVERRIDE___LIBC_CALLOC
+#undef JEMALLOC_OVERRIDE___LIBC_FREE
+#undef JEMALLOC_OVERRIDE___LIBC_FREE_SIZED
+#undef JEMALLOC_OVERRIDE___LIBC_FREE_ALIGNED_SIZED
+#undef JEMALLOC_OVERRIDE___LIBC_MALLOC
+#undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN
+#undef JEMALLOC_OVERRIDE___LIBC_REALLOC
+#undef JEMALLOC_OVERRIDE___LIBC_VALLOC
+#undef JEMALLOC_OVERRIDE___LIBC_PVALLOC
+#undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN
+
 /*
 * JEMALLOC_PRIVATE_NAMESPACE is used as a prefix for all library-private APIs.
 * For shared libraries, symbol visibility mechanisms prevent these symbols
@ -21,58 +36,41 @@
 * order to yield to another virtual CPU.
 */
 #undef CPU_SPINWAIT
+/* 1 if CPU_SPINWAIT is defined, 0 otherwise. */
+#undef HAVE_CPU_SPINWAIT
+
+/*
+ * Number of significant bits in virtual addresses.  This may be less than the
+ * total number of bits in a pointer, e.g. on x64, for which the uppermost 16
+ * bits are the same as bit 47.
+ */
+#undef LG_VADDR

 /* Defined if C11 atomics are available. */
-#undef JEMALLOC_C11ATOMICS
+#undef JEMALLOC_C11_ATOMICS

-/* Defined if the equivalent of FreeBSD's atomic(9) functions are available. */
-#undef JEMALLOC_ATOMIC9
+/* Defined if GCC __atomic atomics are available. */
+#undef JEMALLOC_GCC_ATOMIC_ATOMICS
+/* and the 8-bit variant support. */
+#undef JEMALLOC_GCC_U8_ATOMIC_ATOMICS

-/*
- * Defined if OSAtomic*() functions are available, as provided by Darwin, and
- * documented in the atomic(3) manual page.
- */
-#undef JEMALLOC_OSATOMIC
-
-/*
- * Defined if __sync_add_and_fetch(uint32_t *, uint32_t) and
- * __sync_sub_and_fetch(uint32_t *, uint32_t) are available, despite
- * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 not being defined (which means the
- * functions are defined in libgcc instead of being inlines).
- */
-#undef JE_FORCE_SYNC_COMPARE_AND_SWAP_4
-
-/*
- * Defined if __sync_add_and_fetch(uint64_t *, uint64_t) and
- * __sync_sub_and_fetch(uint64_t *, uint64_t) are available, despite
- * __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 not being defined (which means the
- * functions are defined in libgcc instead of being inlines).
- */
-#undef JE_FORCE_SYNC_COMPARE_AND_SWAP_8
+/* Defined if GCC __sync atomics are available. */
+#undef JEMALLOC_GCC_SYNC_ATOMICS
+/* and the 8-bit variant support. */
+#undef JEMALLOC_GCC_U8_SYNC_ATOMICS

 /*
 * Defined if __builtin_clz() and __builtin_clzl() are available.
 */
 #undef JEMALLOC_HAVE_BUILTIN_CLZ

-/*
- * Defined if madvise(2) is available.
- */
-#undef JEMALLOC_HAVE_MADVISE
-
 /*
 * Defined if os_unfair_lock_*() functions are available, as provided by Darwin.
 */
 #undef JEMALLOC_OS_UNFAIR_LOCK

-/*
- * Defined if OSSpin*() functions are available, as provided by Darwin, and
- * documented in the spinlock(3) manual page.
- */
-#undef JEMALLOC_OSSPIN
-
-/* Defined if syscall(2) is available. */
-#undef JEMALLOC_HAVE_SYSCALL
+/* Defined if syscall(2) is usable. */
+#undef JEMALLOC_USE_SYSCALL

 /*
 * Defined if secure_getenv(3) is available.
@ -84,6 +82,21 @@
 */
 #undef JEMALLOC_HAVE_ISSETUGID

+/* Defined if pthread_atfork(3) is available. */
+#undef JEMALLOC_HAVE_PTHREAD_ATFORK
+
+/* Defined if pthread_setname_np(3) is available. */
+#undef JEMALLOC_HAVE_PTHREAD_SETNAME_NP
+
+/* Defined if pthread_getname_np(3) is available. */
+#undef JEMALLOC_HAVE_PTHREAD_GETNAME_NP
+
+/* Defined if pthread_set_name_np(3) is available. */
+#undef JEMALLOC_HAVE_PTHREAD_SET_NAME_NP
+
+/* Defined if pthread_get_name_np(3) is available. */
+#undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP
+
 /*
 * Defined if clock_gettime(CLOCK_MONOTONIC_COARSE, ...) is available.
 */
@ -99,6 +112,16 @@
 */
 #undef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME

+/*
+ * Defined if clock_gettime(CLOCK_REALTIME, ...) is available.
+ */
+#undef JEMALLOC_HAVE_CLOCK_REALTIME
+
+/*
+ * Defined if clock_gettime_nsec_np(CLOCK_UPTIME_RAW) is available.
+ */
+#undef JEMALLOC_HAVE_CLOCK_GETTIME_NSEC_NP
+
 /*
 * Defined if _malloc_thread_cleanup() exists.  At least in the case of
 * FreeBSD, pthread_key_create() allocates, which if used during malloc
@ -125,12 +148,6 @@
 /* Non-empty if the tls_model attribute is supported. */
 #undef JEMALLOC_TLS_MODEL

-/* JEMALLOC_CC_SILENCE enables code that silences unuseful compiler warnings. */
-#undef JEMALLOC_CC_SILENCE
-
-/* JEMALLOC_CODE_COVERAGE enables test code coverage analysis. */
-#undef JEMALLOC_CODE_COVERAGE
-
 /*
 * JEMALLOC_DEBUG enables assertions and other sanity checks, and disables
 * inline functions.
@ -140,6 +157,9 @@
 /* JEMALLOC_STATS enables statistics calculation. */
 #undef JEMALLOC_STATS

+/* JEMALLOC_EXPERIMENTAL_SMALLOCX_API enables experimental smallocx API. */
+#undef JEMALLOC_EXPERIMENTAL_SMALLOCX_API
+
 /* JEMALLOC_PROF enables allocation profiling. */
 #undef JEMALLOC_PROF

@ -152,27 +172,29 @@
 /* Use gcc intrinsics for profile backtracing if defined. */
 #undef JEMALLOC_PROF_GCC

-/*
- * JEMALLOC_TCACHE enables a thread-specific caching layer for small objects.
- * This makes it possible to allocate/deallocate objects without any locking
- * when the cache is in the steady state.
- */
-#undef JEMALLOC_TCACHE
+/* Use frame pointer for profile backtracing if defined. Linux only. */
+#undef JEMALLOC_PROF_FRAME_POINTER
+
+/* JEMALLOC_PAGEID enables page id. */
+#undef JEMALLOC_PAGEID
+
+/* Defined if prctl(2) is available. */
+#undef JEMALLOC_HAVE_PRCTL

 /*
- * JEMALLOC_DSS enables use of sbrk(2) to allocate chunks from the data storage
+ * JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage
 * segment (DSS).
 */
 #undef JEMALLOC_DSS

-/* Support memory filling (junk/zero/quarantine/redzone). */
+/* Support memory filling (junk/zero). */
 #undef JEMALLOC_FILL

 /* Support utrace(2)-based tracing. */
 #undef JEMALLOC_UTRACE

-/* Support Valgrind. */
-#undef JEMALLOC_VALGRIND
+/* Support utrace(2)-based tracing (label based signature). */
+#undef JEMALLOC_UTRACE_LABEL

 /* Support optional abort() on OOM. */
 #undef JEMALLOC_XMALLOC
@ -180,9 +202,6 @@
 /* Support lazy locking (avoid locking unless a second thread is launched). */
 #undef JEMALLOC_LAZY_LOCK

-/* Minimum size class to support is 2^LG_TINY_MIN bytes. */
-#undef LG_TINY_MIN
-
 /*
 * Minimum allocation alignment is 2^LG_QUANTUM bytes (ignoring tiny size
 * classes).
@ -192,6 +211,16 @@
 /* One page is 2^LG_PAGE bytes. */
 #undef LG_PAGE

+/* Maximum number of regions in a slab. */
+#undef CONFIG_LG_SLAB_MAXREGS
+
+/*
+ * One huge page is 2^LG_HUGEPAGE bytes.  Note that this is defined even if the
+ * system does not explicitly support huge pages; system calls that require
+ * explicit huge page support are separately configured.
+ */
+#undef LG_HUGEPAGE
+
 /*
 * If defined, adjacent virtual memory mappings with identical attributes
 * automatically coalesce, and they fragment when changes are made to subranges.
@ -202,11 +231,12 @@
 #undef JEMALLOC_MAPS_COALESCE

 /*
- * If defined, use munmap() to unmap freed chunks, rather than storing them for
- * later reuse.  This is disabled by default on Linux because common sequences
- * of mmap()/munmap() calls will cause virtual memory map holes.
+ * If defined, retain memory for later reuse by default rather than using e.g.
+ * munmap() to unmap freed extents.  This is enabled on 64-bit Linux because
+ * common sequences of mmap()/munmap() calls will cause virtual memory map
+ * holes.
 */
-#undef JEMALLOC_MUNMAP
+#undef JEMALLOC_RETAIN

 /* TLS is used to map arenas and magazine caches to threads. */
 #undef JEMALLOC_TLS
@ -226,10 +256,10 @@
 #undef JEMALLOC_INTERNAL_FFS

 /*
- * JEMALLOC_IVSALLOC enables ivsalloc(), which verifies that pointers reside
- * within jemalloc-owned chunks before dereferencing them.
+ * popcount*() functions to use for bitmapping.
 */
-#undef JEMALLOC_IVSALLOC
+#undef JEMALLOC_INTERNAL_POPCOUNTL
+#undef JEMALLOC_INTERNAL_POPCOUNT

 /*
 * If defined, explicitly attempt to more uniformly distribute large allocation
@ -237,11 +267,28 @@
 */
 #undef JEMALLOC_CACHE_OBLIVIOUS

+/*
+ * If defined, enable logging facilities.  We make this a configure option to
+ * avoid taking extra branches everywhere.
+ */
+#undef JEMALLOC_LOG
+
+/*
+ * If defined, use readlinkat() (instead of readlink()) to follow
+ * /etc/malloc_conf.
+ */
+#undef JEMALLOC_READLINKAT
+
+/*
+ * If defined, use getenv() (instead of secure_getenv() or
+ * alternatives) to access MALLOC_CONF.
+ */
+#undef JEMALLOC_FORCE_GETENV
+
 /*
 * Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
 */
 #undef JEMALLOC_ZONE
-#undef JEMALLOC_ZONE_VERSION

 /*
 * Methods for determining whether the OS overcommits.
@ -252,18 +299,95 @@
 #undef JEMALLOC_SYSCTL_VM_OVERCOMMIT
 #undef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY

+/* Defined if madvise(2) is available. */
+#undef JEMALLOC_HAVE_MADVISE
+
+/*
+ * Defined if transparent huge pages are supported via the MADV_[NO]HUGEPAGE
+ * arguments to madvise(2).
+ */
+#undef JEMALLOC_HAVE_MADVISE_HUGE
+
+/*
+ * Defined if best-effort synchronous collapse of the native
+ * pages mapped by the memory range into transparent huge pages is supported
+ * via MADV_COLLAPSE arguments to madvise(2).
+ */
+#undef JEMALLOC_HAVE_MADVISE_COLLAPSE
+
 /*
 * Methods for purging unused pages differ between operating systems.
 *
- *   madvise(..., MADV_DONTNEED) : On Linux, this immediately discards pages,
+ *   madvise(..., MADV_FREE) : This marks pages as being unused, such that they
+ *                             will be discarded rather than swapped out.
+ *   madvise(..., MADV_DONTNEED) : If JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS is
+ *                                 defined, this immediately discards pages,
 *                                 such that new pages will be demand-zeroed if
- *                                 the address region is later touched.
- *   madvise(..., MADV_FREE) : On FreeBSD and Darwin, this marks pages as being
- *                             unused, such that they will be discarded rather
- *                             than swapped out.
+ *                                 the address region is later touched;
+ *                                 otherwise this behaves similarly to
+ *                                 MADV_FREE, though typically with higher
+ *                                 system overhead.
 */
-#undef JEMALLOC_PURGE_MADVISE_DONTNEED
 #undef JEMALLOC_PURGE_MADVISE_FREE
+#undef JEMALLOC_PURGE_MADVISE_DONTNEED
+#undef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS
+
+/* Defined if madvise(2) is available but MADV_FREE is not (x86 Linux only). */
+#undef JEMALLOC_DEFINE_MADVISE_FREE
+
+/*
+ * Defined if MADV_DO[NT]DUMP is supported as an argument to madvise.
+ */
+#undef JEMALLOC_MADVISE_DONTDUMP
+
+/*
+ * Defined if MADV_[NO]CORE is supported as an argument to madvise.
+ */
+#undef JEMALLOC_MADVISE_NOCORE
+
+/* Defined if process_madvise(2) is available. */
+#undef JEMALLOC_HAVE_PROCESS_MADVISE
+
+#undef EXPERIMENTAL_SYS_PROCESS_MADVISE_NR
+
+/* Defined if mprotect(2) is available. */
+#undef JEMALLOC_HAVE_MPROTECT
+
+/* Defined if sys/sdt.h is available and sdt tracing enabled */
+#undef JEMALLOC_EXPERIMENTAL_USDT_STAP
+
+/*
+ * Defined if sys/sdt.h is unavailable, sdt tracing enabled, and
+ * platform is supported
+ */
+#undef JEMALLOC_EXPERIMENTAL_USDT_CUSTOM
+
+/*
+ * Defined if transparent huge pages (THPs) are supported via the
+ * MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled.
+ */
+#undef JEMALLOC_THP
+
+/* Defined if posix_madvise is available. */
+#undef JEMALLOC_HAVE_POSIX_MADVISE
+
+/*
+ * Method for purging unused pages using posix_madvise.
+ *
+ *   posix_madvise(..., POSIX_MADV_DONTNEED)
+ */
+#undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED
+#undef JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS
+
+/*
+ * Defined if memcntl page admin call is supported
+ */
+#undef JEMALLOC_HAVE_MEMCNTL
+
+/*
+ * Defined if malloc_size is supported
+ */
+#undef JEMALLOC_HAVE_MALLOC_SIZE

 /* Define if operating system has alloca.h header. */
 #undef JEMALLOC_HAS_ALLOCA_H
@ -292,9 +416,32 @@
 /* glibc memalign hook. */
 #undef JEMALLOC_GLIBC_MEMALIGN_HOOK

+/* pthread support */
+#undef JEMALLOC_HAVE_PTHREAD
+
+/* dlsym() support */
+#undef JEMALLOC_HAVE_DLSYM
+
 /* Adaptive mutex support in pthreads. */
 #undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP

+/* gettid() support */
+#undef JEMALLOC_HAVE_GETTID
+
+/* GNU specific sched_getcpu support */
+#undef JEMALLOC_HAVE_SCHED_GETCPU
+
+/* GNU specific sched_setaffinity support */
+#undef JEMALLOC_HAVE_SCHED_SETAFFINITY
+
+/* pthread_setaffinity_np support */
+#undef JEMALLOC_HAVE_PTHREAD_SETAFFINITY_NP
+
+/*
+ * If defined, all the features necessary for background threads are present.
+ */
+#undef JEMALLOC_BACKGROUND_THREAD
+
 /*
 * If defined, jemalloc symbols are not exported (doesn't work when
 * JEMALLOC_PREFIX is not defined).
@ -304,4 +451,44 @@
 /* config.malloc_conf options string. */
 #undef JEMALLOC_CONFIG_MALLOC_CONF

+/* If defined, jemalloc takes the malloc/free/etc. symbol names. */
+#undef JEMALLOC_IS_MALLOC
+
+/*
+ * Defined if strerror_r returns char * if _GNU_SOURCE is defined.
+ */
+#undef JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE
+
+/* Performs additional safety checks when defined. */
+#undef JEMALLOC_OPT_SAFETY_CHECKS
+
+/* Is C++ support being built? */
+#undef JEMALLOC_ENABLE_CXX
+
+/* Performs additional size checks when defined. */
+#undef JEMALLOC_OPT_SIZE_CHECKS
+
+/* Allows sampled junk and stash for checking use-after-free when defined. */
+#undef JEMALLOC_UAF_DETECTION
+
+/* Darwin VM_MAKE_TAG support */
+#undef JEMALLOC_HAVE_VM_MAKE_TAG
+
+/* If defined, realloc(ptr, 0) defaults to "free" instead of "alloc". */
+#undef JEMALLOC_ZERO_REALLOC_DEFAULT_FREE
+
+/* If defined, use volatile asm during benchmarks. */
+#undef JEMALLOC_HAVE_ASM_VOLATILE
+
+/*
+ * If defined, support the use of rdtscp to get the time stamp counter
+ * and the processor ID.
+ */
+#undef JEMALLOC_HAVE_RDTSCP
+
+/* If defined, use __int128 for optimization. */
+#undef JEMALLOC_HAVE_INT128
+
+#include "jemalloc/internal/jemalloc_internal_overrides.h"
+
 #endif /* JEMALLOC_INTERNAL_DEFS_H_ */
--- a/include/jemalloc/internal/jemalloc_internal_externs.h
+++ b/include/jemalloc/internal/jemalloc_internal_externs.h
@ -0,0 +1,91 @@
+#ifndef JEMALLOC_INTERNAL_EXTERNS_H
+#define JEMALLOC_INTERNAL_EXTERNS_H
+
+#include "jemalloc/internal/arena_types.h"
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/fxp.h"
+#include "jemalloc/internal/hpa_opts.h"
+#include "jemalloc/internal/nstime.h"
+#include "jemalloc/internal/sec_opts.h"
+#include "jemalloc/internal/tsd_types.h"
+
+/* TSD checks this to set thread local slow state accordingly. */
+extern bool malloc_slow;
+
+/* Run-time options. */
+extern bool             opt_abort;
+extern bool             opt_abort_conf;
+extern bool             opt_trust_madvise;
+extern bool             opt_experimental_hpa_start_huge_if_thp_always;
+extern bool             opt_experimental_hpa_enforce_hugify;
+extern bool             opt_confirm_conf;
+extern bool             opt_hpa;
+extern hpa_shard_opts_t opt_hpa_opts;
+extern sec_opts_t       opt_hpa_sec_opts;
+
+extern const char *opt_junk;
+extern bool        opt_junk_alloc;
+extern bool        opt_junk_free;
+extern void (*JET_MUTABLE junk_free_callback)(void *ptr, size_t size);
+extern void (*JET_MUTABLE junk_alloc_callback)(void *ptr, size_t size);
+extern void (*JET_MUTABLE invalid_conf_abort)(void);
+extern bool                  opt_utrace;
+extern bool                  opt_xmalloc;
+extern bool                  opt_experimental_infallible_new;
+extern bool                  opt_experimental_tcache_gc;
+extern bool                  opt_zero;
+extern unsigned              opt_narenas;
+extern fxp_t                 opt_narenas_ratio;
+extern zero_realloc_action_t opt_zero_realloc_action;
+extern malloc_init_t         malloc_init_state;
+extern const char *const     zero_realloc_mode_names[];
+extern atomic_zu_t           zero_realloc_count;
+extern bool                  opt_cache_oblivious;
+extern unsigned              opt_debug_double_free_max_scan;
+extern size_t                opt_calloc_madvise_threshold;
+extern bool                  opt_disable_large_size_classes;
+
+extern const char *opt_malloc_conf_symlink;
+extern const char *opt_malloc_conf_env_var;
+
+/* Escape free-fastpath when ptr & mask == 0 (for sanitization purpose). */
+extern uintptr_t san_cache_bin_nonfast_mask;
+
+/* Number of CPUs. */
+extern unsigned ncpus;
+
+/* Number of arenas used for automatic multiplexing of threads and arenas. */
+extern unsigned narenas_auto;
+
+/* Base index for manual arenas. */
+extern unsigned manual_arena_base;
+
+/*
+ * Arenas that are used to service external requests.  Not all elements of the
+ * arenas array are necessarily used; arenas are created lazily as needed.
+ */
+extern atomic_p_t arenas[];
+
+extern unsigned huge_arena_ind;
+
+void    *a0malloc(size_t size);
+void     a0dalloc(void *ptr);
+void    *bootstrap_malloc(size_t size);
+void    *bootstrap_calloc(size_t num, size_t size);
+void     bootstrap_free(void *ptr);
+void     arena_set(unsigned ind, arena_t *arena);
+unsigned narenas_total_get(void);
+arena_t *arena_init(tsdn_t *tsdn, unsigned ind, const arena_config_t *config);
+arena_t *arena_choose_hard(tsd_t *tsd, bool internal);
+void     arena_migrate(tsd_t *tsd, arena_t *oldarena, arena_t *newarena);
+void     iarena_cleanup(tsd_t *tsd);
+void     arena_cleanup(tsd_t *tsd);
+size_t   batch_alloc(void **ptrs, size_t num, size_t size, int flags);
+void     jemalloc_prefork(void);
+void     jemalloc_postfork_parent(void);
+void     jemalloc_postfork_child(void);
+void     sdallocx_default(void *ptr, size_t size, int flags);
+void     free_default(void *ptr);
+void    *malloc_default(size_t size);
+
+#endif /* JEMALLOC_INTERNAL_EXTERNS_H */
--- a/include/jemalloc/internal/jemalloc_internal_includes.h
+++ b/include/jemalloc/internal/jemalloc_internal_includes.h
@ -0,0 +1,84 @@
+#ifndef JEMALLOC_INTERNAL_INCLUDES_H
+#define JEMALLOC_INTERNAL_INCLUDES_H
+
+/*
+ * jemalloc can conceptually be broken into components (arena, tcache, etc.),
+ * but there are circular dependencies that cannot be broken without
+ * substantial performance degradation.
+ *
+ * Historically, we dealt with this by splitting each header into four
+ * sections (types, structs, externs, and inlines), and included each header
+ * file multiple times in this file, picking out the portion we want on each
+ * pass using the following #defines:
+ *   JEMALLOC_H_TYPES   : Preprocessor-defined constants and pseudo-opaque data
+ *                        types.
+ *   JEMALLOC_H_STRUCTS : Data structures.
+ *   JEMALLOC_H_EXTERNS : Extern data declarations and function prototypes.
+ *   JEMALLOC_H_INLINES : Inline functions.
+ *
+ * We're moving toward a world in which the dependencies are explicit; each file
+ * will #include the headers it depends on (rather than relying on them being
+ * implicitly available via this file including every header file in the
+ * project).
+ *
+ * We're now in an intermediate state: we've broken up the header files to avoid
+ * having to include each one multiple times, but have not yet moved the
+ * dependency information into the header files (i.e. we still rely on the
+ * ordering in this file to ensure all a header's dependencies are available in
+ * its translation unit).  Each component is now broken up into multiple header
+ * files, corresponding to the sections above (e.g. instead of "foo.h", we now
+ * have "foo_types.h", "foo_structs.h", "foo_externs.h", "foo_inlines.h").
+ *
+ * Those files which have been converted to explicitly include their
+ * inter-component dependencies are now in the initial HERMETIC HEADERS
+ * section.  All headers may still rely on jemalloc_preamble.h (which, by fiat,
+ * must be included first in every translation unit) for system headers and
+ * global jemalloc definitions, however.
+ */
+
+/******************************************************************************/
+/* TYPES */
+/******************************************************************************/
+
+#include "jemalloc/internal/arena_types.h"
+#include "jemalloc/internal/tcache_types.h"
+#include "jemalloc/internal/prof_types.h"
+
+/******************************************************************************/
+/* STRUCTS */
+/******************************************************************************/
+
+#include "jemalloc/internal/prof_structs.h"
+#include "jemalloc/internal/arena_structs.h"
+#include "jemalloc/internal/tcache_structs.h"
+#include "jemalloc/internal/background_thread_structs.h"
+
+/******************************************************************************/
+/* EXTERNS */
+/******************************************************************************/
+
+#include "jemalloc/internal/jemalloc_internal_externs.h"
+#include "jemalloc/internal/arena_externs.h"
+#include "jemalloc/internal/large_externs.h"
+#include "jemalloc/internal/tcache_externs.h"
+#include "jemalloc/internal/prof_externs.h"
+#include "jemalloc/internal/background_thread_externs.h"
+
+/******************************************************************************/
+/* INLINES */
+/******************************************************************************/
+
+#include "jemalloc/internal/jemalloc_internal_inlines_a.h"
+/*
+ * Include portions of arena code interleaved with tcache code in order to
+ * resolve circular dependencies.
+ */
+#include "jemalloc/internal/arena_inlines_a.h"
+#include "jemalloc/internal/jemalloc_internal_inlines_b.h"
+#include "jemalloc/internal/tcache_inlines.h"
+#include "jemalloc/internal/arena_inlines_b.h"
+#include "jemalloc/internal/jemalloc_internal_inlines_c.h"
+#include "jemalloc/internal/prof_inlines.h"
+#include "jemalloc/internal/background_thread_inlines.h"
+
+#endif /* JEMALLOC_INTERNAL_INCLUDES_H */
--- a/include/jemalloc/internal/jemalloc_internal_inlines_a.h
+++ b/include/jemalloc/internal/jemalloc_internal_inlines_a.h
@ -0,0 +1,135 @@
+#ifndef JEMALLOC_INTERNAL_INLINES_A_H
+#define JEMALLOC_INTERNAL_INLINES_A_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/arena_externs.h"
+#include "jemalloc/internal/arena_types.h"
+#include "jemalloc/internal/atomic.h"
+#include "jemalloc/internal/bit_util.h"
+#include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/sc.h"
+#include "jemalloc/internal/tcache_externs.h"
+#include "jemalloc/internal/ticker.h"
+
+/*
+ * Return the id of the CPU the calling thread is currently running on.  Only
+ * valid when per-CPU arena support is compiled in (have_percpu_arena).  The
+ * lookup mechanism is selected at compile time; a build that matches none of
+ * the branches must never reach this function.
+ */
+JEMALLOC_ALWAYS_INLINE malloc_cpuid_t
+malloc_getcpu(void) {
+	assert(have_percpu_arena);
+#if defined(_WIN32)
+	return GetCurrentProcessorNumber();
+#elif defined(JEMALLOC_HAVE_SCHED_GETCPU)
+	return (malloc_cpuid_t)sched_getcpu();
+#elif defined(JEMALLOC_HAVE_RDTSCP)
+	/* Only ecx is consumed; its low 12 bits are used as the CPU id. */
+	unsigned int ecx;
+	asm volatile("rdtscp" : "=c"(ecx)::"eax", "edx");
+	return (malloc_cpuid_t)(ecx & 0xfff);
+#elif defined(__aarch64__) && defined(__APPLE__)
+	/* Other oses most likely use tpidr_el0 instead */
+	uintptr_t c;
+	asm volatile("mrs %x0, tpidrro_el0" : "=r"(c)::"memory");
+	/*
+	 * Keep the low three bits.  The mask is parenthesized explicitly:
+	 * '-' binds tighter than '&', so the value is unchanged, but the
+	 * intent no longer depends on operator precedence (and compilers
+	 * stop warning about it).
+	 */
+	return (malloc_cpuid_t)(c & ((1 << 3) - 1));
+#else
+	not_reached();
+	return -1;
+#endif
+}
+
+/*
+ * Map the CPU the caller is running on to an arena index.  In percpu_arena
+ * mode the index is the CPU id itself; otherwise (per_phycpu_arena) CPU ids
+ * at or above ncpus / 2 are shifted down by ncpus / 2.
+ */
+JEMALLOC_ALWAYS_INLINE unsigned
+percpu_arena_choose(void) {
+	assert(have_percpu_arena && PERCPU_ARENA_ENABLED(opt_percpu_arena));
+
+	malloc_cpuid_t cur_cpu = malloc_getcpu();
+	assert(cur_cpu >= 0);
+
+	bool fold_upper_half = (opt_percpu_arena != percpu_arena)
+	    && ((unsigned)cur_cpu >= ncpus / 2);
+	if (!fold_upper_half) {
+		return cur_cpu;
+	}
+
+	assert(opt_percpu_arena == per_phycpu_arena);
+	/* Hyper threads on the same physical CPU share arena. */
+	return cur_cpu - ncpus / 2;
+}
+
+/*
+ * Exclusive upper bound of the automatic per-CPU arena index range, i.e. the
+ * range is arenas[0...limit).
+ */
+JEMALLOC_ALWAYS_INLINE unsigned
+percpu_arena_ind_limit(percpu_arena_mode_t mode) {
+	assert(have_percpu_arena && PERCPU_ARENA_ENABLED(mode));
+
+	if (mode != per_phycpu_arena || ncpus <= 1) {
+		return ncpus;
+	}
+
+	unsigned half = ncpus / 2;
+	/* An odd CPU count likely means a misconfig; round the half up. */
+	return (ncpus % 2 != 0) ? half + 1 : half;
+}
+
+/*
+ * Load the arena stored at index ind.  When the slot is empty and
+ * init_if_missing is set, the arena is created via arena_init() with the
+ * default config; otherwise an empty slot yields NULL.
+ */
+static inline arena_t *
+arena_get(tsdn_t *tsdn, unsigned ind, bool init_if_missing) {
+	assert(ind < MALLOCX_ARENA_LIMIT);
+
+	arena_t *arena = (arena_t *)atomic_load_p(
+	    &arenas[ind], ATOMIC_ACQUIRE);
+	if (unlikely(arena == NULL) && init_if_missing) {
+		return arena_init(tsdn, ind, &arena_config_default);
+	}
+	return arena;
+}
+
+/*
+ * Report whether the thread's automatic tcache can be used right now.
+ *
+ * The auto tcache might be unavailable either 1) during tcache
+ * initialization, or 2) because it was disabled through the
+ * thread.tcache.enabled mallctl or config options.  The single enabled-flag
+ * check below covers all cases.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+tcache_available(tsd_t *tsd) {
+	if (unlikely(!tsd_tcache_enabled_get(tsd))) {
+		return false;
+	}
+
+	/* Associated arena == NULL implies tcache init in progress. */
+	if (config_debug && tsd_tcache_slowp_get(tsd)->arena != NULL) {
+		tcache_assert_initialized(tsd_tcachep_get(tsd));
+	}
+	return true;
+}
+
+/* Return the thread's auto tcache, or NULL when it is not available. */
+JEMALLOC_ALWAYS_INLINE tcache_t *
+tcache_get(tsd_t *tsd) {
+	return tcache_available(tsd) ? tsd_tcachep_get(tsd) : NULL;
+}
+
+/*
+ * Return the thread's tcache_slow_t, or NULL when the auto tcache is not
+ * available.
+ */
+JEMALLOC_ALWAYS_INLINE tcache_slow_t *
+tcache_slow_get(tsd_t *tsd) {
+	return tcache_available(tsd) ? tsd_tcache_slowp_get(tsd) : NULL;
+}
+
+/*
+ * Enter a reentrant section on behalf of arena.  After asserting that arena
+ * is not arena 0, this delegates to tsd_pre_reentrancy_raw().
+ */
+static inline void
+pre_reentrancy(tsd_t *tsd, arena_t *arena) {
+	/* arena is the current context.  Reentry from a0 is not allowed. */
+	assert(arena != arena_get(tsd_tsdn(tsd), 0, false));
+	tsd_pre_reentrancy_raw(tsd);
+}
+
+/*
+ * Leave a reentrant section; counterpart of pre_reentrancy().  Delegates to
+ * tsd_post_reentrancy_raw().
+ */
+static inline void
+post_reentrancy(tsd_t *tsd) {
+	tsd_post_reentrancy_raw(tsd);
+}
+
+#endif /* JEMALLOC_INTERNAL_INLINES_A_H */
--- a/include/jemalloc/internal/jemalloc_internal_inlines_b.h
+++ b/include/jemalloc/internal/jemalloc_internal_inlines_b.h
@ -0,0 +1,106 @@
+#ifndef JEMALLOC_INTERNAL_INLINES_B_H
+#define JEMALLOC_INTERNAL_INLINES_B_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/arena_inlines_a.h"
+#include "jemalloc/internal/extent.h"
+#include "jemalloc/internal/jemalloc_internal_inlines_a.h"
+
+/*
+ * Rebind the calling thread to the arena whose index equals cpu, unless it
+ * is already bound to it.  The target arena is created on demand, and an
+ * available tcache is reassociated with it.
+ */
+static inline void
+percpu_arena_update(tsd_t *tsd, unsigned cpu) {
+	assert(have_percpu_arena);
+	arena_t *cur_arena = tsd_arena_get(tsd);
+	assert(cur_arena != NULL);
+
+	if (arena_ind_get(cur_arena) == cpu) {
+		/* Already bound to the right arena. */
+		return;
+	}
+
+	arena_t *target = arena_get(tsd_tsdn(tsd), cpu, true);
+	assert(target != NULL);
+
+	/* Set new arena/tcache associations. */
+	arena_migrate(tsd, cur_arena, target);
+	tcache_t *tcache = tcache_get(tsd);
+	if (tcache == NULL) {
+		return;
+	}
+	tcache_slow_t *tcache_slow = tsd_tcache_slowp_get(tsd);
+	assert(tcache_slow->arena != NULL);
+	tcache_arena_reassociate(tsd_tsdn(tsd), tcache_slow, tcache, target);
+}
+
+/*
+ * Choose an arena based on a per-thread value.
+ *
+ * A non-NULL arena argument is returned unchanged.  Otherwise the thread's
+ * cached arena is used (tsd_iarena_get() when internal is true,
+ * tsd_arena_get() otherwise), falling back to arena_choose_hard() when none
+ * is cached.  For non-internal requests with per-CPU arenas enabled, the
+ * result may then be switched to the arena of the current CPU.
+ */
+static inline arena_t *
+arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) {
+	arena_t *ret;
+
+	if (arena != NULL) {
+		return arena;
+	}
+
+	/* During reentrancy, arena 0 is the safest bet. */
+	if (unlikely(tsd_reentrancy_level_get(tsd) > 0)) {
+		return arena_get(tsd_tsdn(tsd), 0, true);
+	}
+
+	ret = internal ? tsd_iarena_get(tsd) : tsd_arena_get(tsd);
+	if (unlikely(ret == NULL)) {
+		ret = arena_choose_hard(tsd, internal);
+		assert(ret);
+		/* Bind an available tcache to the freshly chosen arena. */
+		if (tcache_available(tsd)) {
+			tcache_slow_t *tcache_slow = tsd_tcache_slowp_get(tsd);
+			tcache_t      *tcache = tsd_tcachep_get(tsd);
+			if (tcache_slow->arena != NULL) {
+				/* See comments in tsd_tcache_data_init().*/
+				assert(tcache_slow->arena
+				    == arena_get(tsd_tsdn(tsd), 0, false));
+				if (tcache_slow->arena != ret) {
+					tcache_arena_reassociate(tsd_tsdn(tsd),
+					    tcache_slow, tcache, ret);
+				}
+			} else {
+				tcache_arena_associate(
+				    tsd_tsdn(tsd), tcache_slow, tcache, ret);
+			}
+		}
+	}
+
+	/*
+	 * Note that for percpu arena, if the current arena is outside of the
+	 * auto percpu arena range, (i.e. thread is assigned to a manually
+	 * managed arena), then percpu arena is skipped.
+	 *
+	 * The last_thd comparison skips the CPU lookup when this thread is
+	 * the one most recently recorded on the arena by this code path.
+	 */
+	if (have_percpu_arena && PERCPU_ARENA_ENABLED(opt_percpu_arena)
+	    && !internal
+	    && (arena_ind_get(ret) < percpu_arena_ind_limit(opt_percpu_arena))
+	    && (ret->last_thd != tsd_tsdn(tsd))) {
+		unsigned ind = percpu_arena_choose();
+		if (arena_ind_get(ret) != ind) {
+			percpu_arena_update(tsd, ind);
+			ret = tsd_arena_get(tsd);
+		}
+		ret->last_thd = tsd_tsdn(tsd);
+	}
+
+	return ret;
+}
+
+/* Choose an arena for a non-internal request (internal == false). */
+static inline arena_t *
+arena_choose(tsd_t *tsd, arena_t *arena) {
+	return arena_choose_impl(tsd, arena, false);
+}
+
+/* Choose an arena for an internal request (internal == true). */
+static inline arena_t *
+arena_ichoose(tsd_t *tsd, arena_t *arena) {
+	return arena_choose_impl(tsd, arena, true);
+}
+
+/*
+ * Return true when arena's index lies below manual_arena_base, i.e. it is
+ * not in the manual arena index range.
+ */
+static inline bool
+arena_is_auto(arena_t *arena) {
+	assert(narenas_auto > 0);
+
+	return (arena_ind_get(arena) < manual_arena_base);
+}
+
+#endif /* JEMALLOC_INTERNAL_INLINES_B_H */
--- a/include/jemalloc/internal/jemalloc_internal_inlines_c.h
+++ b/include/jemalloc/internal/jemalloc_internal_inlines_c.h
@ -0,0 +1,600 @@
+#ifndef JEMALLOC_INTERNAL_INLINES_C_H
+#define JEMALLOC_INTERNAL_INLINES_C_H
+
+#include "jemalloc/internal/jemalloc_preamble.h"
+#include "jemalloc/internal/arena_externs.h"
+#include "jemalloc/internal/arena_inlines_b.h"
+#include "jemalloc/internal/emap.h"
+#include "jemalloc/internal/hook.h"
+#include "jemalloc/internal/jemalloc_internal_types.h"
+#include "jemalloc/internal/log.h"
+#include "jemalloc/internal/sz.h"
+#include "jemalloc/internal/thread_event.h"
+#include "jemalloc/internal/witness.h"
+
+/*
+ * These correspond to the macros in jemalloc/jemalloc_macros.h.  Broadly, we
+ * should have one constant here per magic value there.  Note however that the
+ * representations need not be related.
+ */
+#define TCACHE_IND_NONE ((unsigned)-1)
+#define TCACHE_IND_AUTOMATIC ((unsigned)-2)
+#define ARENA_IND_AUTOMATIC ((unsigned)-1)
+
+/*
+ * Translating the names of the 'i' functions:
+ *   Abbreviations used in the first part of the function name (before
+ *   alloc/dalloc) describe what that function accomplishes:
+ *     a: arena (query)
+ *     s: size (query, or sized deallocation)
+ *     e: extent (query)
+ *     p: aligned (allocates)
+ *     vs: size (query, without knowing that the pointer is into the heap)
+ *     r: rallocx implementation
+ *     x: xallocx implementation
+ *   Abbreviations used in the second part of the function name (after
+ *   alloc/dalloc) describe the arguments it takes
+ *     z: whether to return zeroed memory
+ *     t: accepts a tcache_t * parameter
+ *     m: accepts an arena_t * parameter
+ */
+
+/*
+ * Arena query: return arena_aalloc()'s answer for ptr, which must be
+ * non-NULL.
+ */
+JEMALLOC_ALWAYS_INLINE arena_t *
+iaalloc(tsdn_t *tsdn, const void *ptr) {
+	assert(ptr != NULL);
+
+	return arena_aalloc(tsdn, ptr);
+}
+
+/*
+ * Size query: return arena_salloc()'s answer for ptr, which must be
+ * non-NULL.
+ */
+JEMALLOC_ALWAYS_INLINE size_t
+isalloc(tsdn_t *tsdn, const void *ptr) {
+	assert(ptr != NULL);
+
+	return arena_salloc(tsdn, ptr);
+}
+
+/*
+ * Allocate via arena_malloc() with an explicit slab choice.
+ *
+ * Internal allocations (is_internal) must not use a tcache and must target
+ * either no explicit arena or an auto arena.  With stats enabled, a
+ * successful internal allocation is added to its arena's internal counter.
+ * Returns the result of arena_malloc().
+ */
+JEMALLOC_ALWAYS_INLINE void *
+iallocztm_explicit_slab(tsdn_t *tsdn, size_t size, szind_t ind, bool zero,
+    bool slab, tcache_t *tcache, bool is_internal, arena_t *arena,
+    bool slow_path) {
+	void *ret;
+
+	assert(!slab || sz_can_use_slab(size)); /* slab && large is illegal */
+	assert(!is_internal || tcache == NULL);
+	assert(!is_internal || arena == NULL || arena_is_auto(arena));
+	/* The witness depth check is only made outside of reentrancy. */
+	if (!tsdn_null(tsdn) && tsd_reentrancy_level_get(tsdn_tsd(tsdn)) == 0) {
+		witness_assert_depth_to_rank(
+		    tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
+	}
+
+	ret = arena_malloc(
+	    tsdn, arena, size, ind, zero, slab, tcache, slow_path);
+	if (config_stats && is_internal && likely(ret != NULL)) {
+		arena_internal_add(iaalloc(tsdn, ret), isalloc(tsdn, ret));
+	}
+	return ret;
+}
+
+/*
+ * Same as iallocztm_explicit_slab(), with the slab flag derived from size
+ * via sz_can_use_slab().
+ */
+JEMALLOC_ALWAYS_INLINE void *
+iallocztm(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, tcache_t *tcache,
+    bool is_internal, arena_t *arena, bool slow_path) {
+	return iallocztm_explicit_slab(tsdn, size, ind, zero,
+	    sz_can_use_slab(size), tcache, is_internal, arena, slow_path);
+}
+
+/*
+ * Convenience allocation for the calling thread: uses the thread's tcache
+ * (tcache_get()), is not internal, and passes no explicit arena.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+ialloc(tsd_t *tsd, size_t size, szind_t ind, bool zero, bool slow_path) {
+	return iallocztm(tsd_tsdn(tsd), size, ind, zero, tcache_get(tsd), false,
+	    NULL, slow_path);
+}
+
+/*
+ * Aligned allocation via arena_palloc() with an explicit slab choice.
+ *
+ * usize must be non-zero and already equal to sz_sa2u(usize, alignment).
+ * Internal allocations must not use a tcache and must target either no
+ * explicit arena or an auto arena.  With stats enabled, a successful
+ * internal allocation is added to its arena's internal counter.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+ipallocztm_explicit_slab(tsdn_t *tsdn, size_t usize, size_t alignment,
+    bool zero, bool slab, tcache_t *tcache, bool is_internal, arena_t *arena) {
+	void *ret;
+
+	assert(!slab || sz_can_use_slab(usize)); /* slab && large is illegal */
+	assert(usize != 0);
+	assert(usize == sz_sa2u(usize, alignment));
+	assert(!is_internal || tcache == NULL);
+	assert(!is_internal || arena == NULL || arena_is_auto(arena));
+	witness_assert_depth_to_rank(
+	    tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
+
+	ret = arena_palloc(tsdn, arena, usize, alignment, zero, slab, tcache);
+	/* The returned address must satisfy the requested alignment. */
+	assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret);
+	if (config_stats && is_internal && likely(ret != NULL)) {
+		arena_internal_add(iaalloc(tsdn, ret), isalloc(tsdn, ret));
+	}
+	return ret;
+}
+
+/*
+ * Same as ipallocztm_explicit_slab(), with the slab flag derived from usize
+ * via sz_can_use_slab().
+ */
+JEMALLOC_ALWAYS_INLINE void *
+ipallocztm(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
+    tcache_t *tcache, bool is_internal, arena_t *arena) {
+	bool slab = sz_can_use_slab(usize);
+	return ipallocztm_explicit_slab(
+	    tsdn, usize, alignment, zero, slab, tcache, is_internal, arena);
+}
+
+/* Non-internal aligned allocation with caller-supplied tcache and arena. */
+JEMALLOC_ALWAYS_INLINE void *
+ipalloct(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
+    tcache_t *tcache, arena_t *arena) {
+	return ipallocztm(tsdn, usize, alignment, zero, tcache, false, arena);
+}
+
+/*
+ * Non-internal aligned allocation with caller-supplied tcache, arena, and
+ * slab flag.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+ipalloct_explicit_slab(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
+    bool slab, tcache_t *tcache, arena_t *arena) {
+	return ipallocztm_explicit_slab(
+	    tsdn, usize, alignment, zero, slab, tcache, false, arena);
+}
+
+/*
+ * Convenience aligned allocation for the calling thread: uses the thread's
+ * tcache (tcache_get()), is not internal, and passes no explicit arena.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero) {
+	return ipallocztm(tsd_tsdn(tsd), usize, alignment, zero,
+	    tcache_get(tsd), false, NULL);
+}
+
+/*
+ * Size query that does not assume ptr points into the heap; forwards to
+ * arena_vsalloc().
+ */
+JEMALLOC_ALWAYS_INLINE size_t
+ivsalloc(tsdn_t *tsdn, const void *ptr) {
+	return arena_vsalloc(tsdn, ptr);
+}
+
+/*
+ * Deallocate ptr via arena_dalloc().
+ *
+ * Internal deallocations must not use a tcache and must belong to an auto
+ * arena; with stats enabled their size is subtracted from the arena's
+ * internal counter before the memory is released.
+ */
+JEMALLOC_ALWAYS_INLINE void
+idalloctm(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
+    emap_alloc_ctx_t *alloc_ctx, bool is_internal, bool slow_path) {
+	assert(ptr != NULL);
+	assert(!is_internal || tcache == NULL);
+	assert(!is_internal || arena_is_auto(iaalloc(tsdn, ptr)));
+	witness_assert_depth_to_rank(
+	    tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
+	if (config_stats && is_internal) {
+		arena_internal_sub(iaalloc(tsdn, ptr), isalloc(tsdn, ptr));
+	}
+	/* A reentrant, non-internal free is expected to come without tcache. */
+	if (!is_internal && !tsdn_null(tsdn)
+	    && tsd_reentrancy_level_get(tsdn_tsd(tsdn)) != 0) {
+		assert(tcache == NULL);
+	}
+	arena_dalloc(tsdn, ptr, tcache, alloc_ctx, slow_path);
+}
+
+/*
+ * Convenience deallocation for the calling thread: uses the thread's tcache,
+ * passes no alloc_ctx, is not internal, and takes the slow path.
+ */
+JEMALLOC_ALWAYS_INLINE void
+idalloc(tsd_t *tsd, void *ptr) {
+	idalloctm(tsd_tsdn(tsd), ptr, tcache_get(tsd), NULL, false, true);
+}
+
+/* Sized deallocation: forwards ptr and size to arena_sdalloc(). */
+JEMALLOC_ALWAYS_INLINE void
+isdalloct(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
+    emap_alloc_ctx_t *alloc_ctx, bool slow_path) {
+	witness_assert_depth_to_rank(
+	    tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
+	arena_sdalloc(tsdn, ptr, size, tcache, alloc_ctx, slow_path);
+}
+
+/*
+ * Reallocate by moving: allocate a new object with the requested alignment,
+ * copy min(size, oldsize) bytes into it, invoke the alloc and dalloc hooks,
+ * and free the old object.
+ *
+ * Returns the new pointer, or NULL when the computed usable size is 0 or
+ * exceeds SC_LARGE_MAXCLASS, or when the new allocation fails.  On a NULL
+ * return ptr has not been touched.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
+    size_t alignment, bool zero, bool slab, tcache_t *tcache, arena_t *arena,
+    hook_ralloc_args_t *hook_args) {
+	witness_assert_depth_to_rank(
+	    tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
+	void  *p;
+	size_t usize, copysize;
+
+	usize = sz_sa2u(size, alignment);
+	if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) {
+		return NULL;
+	}
+	p = ipalloct_explicit_slab(
+	    tsdn, usize, alignment, zero, slab, tcache, arena);
+	if (p == NULL) {
+		return NULL;
+	}
+	/*
+	 * Copy at most size bytes (not size+extra), since the caller has no
+	 * expectation that the extra bytes will be reliably preserved.
+	 */
+	copysize = (size < oldsize) ? size : oldsize;
+	memcpy(p, ptr, copysize);
+	/* Hooks run after the copy and before the old object is released. */
+	hook_invoke_alloc(
+	    hook_args->is_realloc ? hook_alloc_realloc : hook_alloc_rallocx, p,
+	    (uintptr_t)p, hook_args->args);
+	hook_invoke_dalloc(
+	    hook_args->is_realloc ? hook_dalloc_realloc : hook_dalloc_rallocx,
+	    ptr, hook_args->args);
+	isdalloct(tsdn, ptr, oldsize, tcache, NULL, true);
+	return p;
+}
+
+/*
+ * Reallocate ptr (non-NULL) to size (non-zero) with an explicit slab choice.
+ * If ptr does not satisfy a non-zero alignment, the object is moved via
+ * iralloct_realign(); otherwise the request is forwarded to arena_ralloc().
+ *
+ * is_realloc threads through the knowledge of whether or not this call comes
+ * from je_realloc (as opposed to je_rallocx); this ensures that we pass the
+ * correct entry point into any hooks.
+ * Note that these functions are all force-inlined, so no actual bool gets
+ * passed-around anywhere.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+iralloct_explicit_slab(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
+    size_t alignment, bool zero, bool slab, tcache_t *tcache, arena_t *arena,
+    hook_ralloc_args_t *hook_args) {
+	assert(ptr != NULL);
+	assert(size != 0);
+	witness_assert_depth_to_rank(
+	    tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
+
+	/* The mask test presumes alignment is a power of two. */
+	if (alignment != 0
+	    && ((uintptr_t)ptr & ((uintptr_t)alignment - 1)) != 0) {
+		/*
+		 * Existing object alignment is inadequate; allocate new space
+		 * and copy.
+		 */
+		return iralloct_realign(tsdn, ptr, oldsize, size, alignment,
+		    zero, slab, tcache, arena, hook_args);
+	}
+
+	return arena_ralloc(tsdn, arena, ptr, oldsize, size, alignment, zero,
+	    slab, tcache, hook_args);
+}
+
+/*
+ * Same as iralloct_explicit_slab(), with the slab flag derived from usize
+ * via sz_can_use_slab().
+ */
+JEMALLOC_ALWAYS_INLINE void *
+iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment,
+    size_t usize, bool zero, tcache_t *tcache, arena_t *arena,
+    hook_ralloc_args_t *hook_args) {
+	return iralloct_explicit_slab(tsdn, ptr, oldsize, size, alignment, zero,
+	    sz_can_use_slab(usize), tcache, arena, hook_args);
+}
+
+/*
+ * Convenience reallocation for the calling thread: uses the thread's tcache
+ * (tcache_get()) and passes no explicit arena.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment,
+    size_t usize, bool zero, hook_ralloc_args_t *hook_args) {
+	return iralloct(tsd_tsdn(tsd), ptr, oldsize, size, alignment, usize,
+	    zero, tcache_get(tsd), NULL, hook_args);
+}
+
+/*
+ * Try to resize ptr without moving it.
+ *
+ * If ptr does not satisfy a non-zero alignment, *newsize is set to oldsize
+ * and true is returned.  Otherwise the result of arena_ralloc_no_move() is
+ * returned, which also receives newsize.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
+    size_t alignment, bool zero, size_t *newsize) {
+	assert(ptr != NULL);
+	assert(size != 0);
+	witness_assert_depth_to_rank(
+	    tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
+
+	if (alignment != 0
+	    && ((uintptr_t)ptr & ((uintptr_t)alignment - 1)) != 0) {
+		/* Existing object alignment is inadequate. */
+		*newsize = oldsize;
+		return true;
+	}
+
+	return arena_ralloc_no_move(
+	    tsdn, ptr, oldsize, size, extra, zero, newsize);
+}
+
+/*
+ * Bookkeeping after a successful fastpath allocation: store the advanced
+ * thread_allocated value and, with stats enabled, count the request on the
+ * bin.  The ret parameter is not used by the body.
+ */
+JEMALLOC_ALWAYS_INLINE void
+fastpath_success_finish(
+    tsd_t *tsd, uint64_t allocated_after, cache_bin_t *bin, void *ret) {
+	thread_allocated_set(tsd, allocated_after);
+	if (config_stats) {
+		bin->tstats.nrequests++;
+	}
+}
+
+/* Return true once malloc_init_state equals malloc_init_initialized. */
+JEMALLOC_ALWAYS_INLINE bool
+malloc_initialized(void) {
+	return (malloc_init_state == malloc_init_initialized);
+}
+
+/*
+ * malloc() fastpath.  Included here so that we can inline it into operator new;
+ * function call overhead there is non-negligible as a fraction of total CPU in
+ * allocation-heavy C++ programs.  We take the fallback alloc to allow malloc
+ * (which can return NULL) to differ in its behavior from operator new (which
+ * can't).  It matches the signature of malloc / operator new so that we can
+ * tail-call the fallback allocator, allowing us to avoid setting up the call
+ * frame in the common case.
+ *
+ * Fastpath assumes size <= SC_LOOKUP_MAXCLASS, and that we hit
+ * tcache.  If either of these is false, we tail-call to the slowpath,
+ * malloc_default().  Tail-calling is used to avoid any caller-saved
+ * registers.
+ *
+ * fastpath supports ticker and profiling, both of which will also
+ * tail-call to the slowpath if they fire.
+ *
+ * size: requested allocation size in bytes.
+ * fallback_alloc: slow-path allocator invoked with size whenever the
+ *   fastpath cannot serve the request.
+ * Returns the pointer taken from the tcache bin, or whatever fallback_alloc
+ * returns.
+ */
+JEMALLOC_ALWAYS_INLINE void *
+imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
+	if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
+		return fallback_alloc(size);
+	}
+
+	tsd_t *tsd = tsd_get(false);
+	if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) {
+		return fallback_alloc(size);
+	}
+	/*
+	 * The code below till the branch checking the next_event threshold may
+	 * execute before malloc_init(), in which case the threshold is 0 to
+	 * trigger slow path and initialization.
+	 *
+	 * Note that when uninitialized, only the fast-path variants of the sz /
+	 * tsd facilities may be called.
+	 */
+	szind_t ind;
+	/*
+	 * The thread_allocated counter in tsd serves as a general purpose
+	 * accumulator for bytes of allocation to trigger different types of
+	 * events.  usize is always needed to advance thread_allocated, though
+	 * it's not always needed in the core allocation logic.
+	 */
+	size_t usize;
+	sz_size2index_usize_fastpath(size, &ind, &usize);
+	/* Fast path relies on size being a bin. */
+	assert(ind < SC_NBINS);
+	assert((SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS)
+	    && (size <= SC_SMALL_MAXCLASS));
+
+	uint64_t allocated, threshold;
+	te_malloc_fastpath_ctx(tsd, &allocated, &threshold);
+	uint64_t allocated_after = allocated + usize;
+	/*
+	 * The ind and usize might be uninitialized (or partially) before
+	 * malloc_init().  The assertions check for: 1) full correctness (usize
+	 * & ind) when initialized; and 2) guaranteed slow-path (threshold == 0)
+	 * when !initialized.
+	 */
+	if (!malloc_initialized()) {
+		assert(threshold == 0);
+	} else {
+		assert(ind == sz_size2index(size));
+		assert(usize > 0 && usize == sz_index2size(ind));
+	}
+	/*
+	 * Check for events and tsd non-nominal (fast_threshold will be set to
+	 * 0) in a single branch.
+	 */
+	if (unlikely(allocated_after >= threshold)) {
+		return fallback_alloc(size);
+	}
+	assert(tsd_fast(tsd));
+
+	tcache_t *tcache = tsd_tcachep_get(tsd);
+	assert(tcache == tcache_get(tsd));
+	cache_bin_t *bin = &tcache->bins[ind];
+	/* Suppress spurious warning from static analysis */
+	assert(bin != NULL);
+	bool  tcache_success;
+	void *ret;
+
+	/*
+	 * We split up the code this way so that redundant low-water
+	 * computation doesn't happen on the (more common) case in which we
+	 * don't touch the low water mark.  The compiler won't do this
+	 * duplication on its own.
+	 */
+	ret = cache_bin_alloc_easy(bin, &tcache_success);
+	if (tcache_success) {
+		fastpath_success_finish(tsd, allocated_after, bin, ret);
+		return ret;
+	}
+	ret = cache_bin_alloc(bin, &tcache_success);
+	if (tcache_success) {
+		fastpath_success_finish(tsd, allocated_after, bin, ret);
+		return ret;
+	}
+
+	/* Both bin attempts failed; let the slow path handle it. */
+	return fallback_alloc(size);
+}
+
+/*
+ * Resolve a tcache index to a tcache pointer.
+ *
+ * TCACHE_IND_AUTOMATIC selects the thread's own tcache: fetched
+ * unconditionally when !slow, via tcache_get() when slow and either
+ * allocating or not reentrant, and NULL otherwise.  TCACHE_IND_NONE yields
+ * NULL.  Any other index is looked up with tcaches_get().
+ */
+JEMALLOC_ALWAYS_INLINE tcache_t *
+tcache_get_from_ind(tsd_t *tsd, unsigned tcache_ind, bool slow, bool is_alloc) {
+	tcache_t *tcache;
+	if (tcache_ind == TCACHE_IND_AUTOMATIC) {
+		if (likely(!slow)) {
+			/* Getting tcache ptr unconditionally. */
+			tcache = tsd_tcachep_get(tsd);
+			assert(tcache == tcache_get(tsd));
+		} else if (is_alloc
+		    || likely(tsd_reentrancy_level_get(tsd) == 0)) {
+			tcache = tcache_get(tsd);
+		} else {
+			/* Reentrant deallocation on the slow path: no tcache. */
+			tcache = NULL;
+		}
+	} else {
+		/*
+		 * Should not specify tcache on deallocation path when being
+		 * reentrant.
+		 */
+		assert(is_alloc || tsd_reentrancy_level_get(tsd) == 0
+		    || tsd_state_nocleanup(tsd));
+		if (tcache_ind == TCACHE_IND_NONE) {
+			tcache = NULL;
+		} else {
+			tcache = tcaches_get(tsd, tcache_ind);
+		}
+	}
+	return tcache;
+}
+
+/*
+ * When size checks are compiled in (config_opt_size_checks), compare the
+ * caller-derived alloc_ctx against the context looked up for ptr in the
+ * global emap.  Returns true after reporting a mismatch in size class or
+ * slab bit, and false when the contexts agree or checks are disabled.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+maybe_check_alloc_ctx(tsd_t *tsd, void *ptr, emap_alloc_ctx_t *alloc_ctx) {
+	if (!config_opt_size_checks) {
+		return false;
+	}
+
+	/* Context recorded for ptr, as opposed to the caller's view. */
+	emap_alloc_ctx_t actual;
+	emap_alloc_ctx_lookup(tsd_tsdn(tsd), &arena_emap_global, ptr, &actual);
+
+	if (alloc_ctx->szind != actual.szind) {
+		safety_check_fail_sized_dealloc(
+		    /* current_dealloc */ true, ptr,
+		    /* true_size */ emap_alloc_ctx_usize_get(&actual),
+		    /* input_size */
+		    emap_alloc_ctx_usize_get(alloc_ctx));
+		return true;
+	}
+	if (alloc_ctx->slab != actual.slab) {
+		safety_check_fail(
+		    "Internal heap corruption detected: "
+		    "mismatch in slab bit");
+		return true;
+	}
+	return false;
+}
+
+/*
+ * Return true when none of the PROF_SAMPLE_ALIGNMENT_MASK bits are set in
+ * ptr's address.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+prof_sample_aligned(const void *ptr) {
+	return ((uintptr_t)ptr & PROF_SAMPLE_ALIGNMENT_MASK) == 0;
+}
+
+/*
+ * Decide whether ptr must bypass the free fastpath because of its alignment.
+ *
+ * free_fastpath does not handle two uncommon cases: 1) sampled profiled
+ * objects and 2) sampled junk & stash for use-after-free detection.  Both
+ * have special alignments which are used to escape the fastpath.
+ *
+ * prof_sample is page-aligned, which covers the UAF check when both are
+ * enabled (see the debug assertion in the body).  Redundant checks are
+ * avoided since this is on the fastpath -- at most one runtime branch
+ * results from this function.
+ */
+JEMALLOC_ALWAYS_INLINE bool
+free_fastpath_nonfast_aligned(void *ptr, bool check_prof) {
+	if (config_debug && cache_bin_nonfast_aligned(ptr)) {
+		assert(prof_sample_aligned(ptr));
+	}
+
+	/* When prof is enabled, the prof_sample alignment is enough. */
+	if (config_prof && check_prof) {
+		return prof_sample_aligned(ptr);
+	}
+
+	return config_uaf_detection && cache_bin_nonfast_aligned(ptr);
+}
+
+/*
+ * Try to free ptr through the thread's tcache bin.
+ *
+ * size is only consulted when size_hint is true; otherwise the size class is
+ * looked up from the emap.  Returns whether or not the free attempt was
+ * successful; on false the caller is expected to fall back to a slower
+ * deallocation routine.
+ */
+JEMALLOC_ALWAYS_INLINE
+bool
+free_fastpath(void *ptr, size_t size, bool size_hint) {
+	tsd_t *tsd = tsd_get(false);
+	/* The branch gets optimized away unless tsd_get_allocates(). */
+	if (unlikely(tsd == NULL)) {
+		return false;
+	}
+	/*
+	 * The tsd_fast() / initialized checks are folded into the branch
+	 * testing (deallocated_after >= threshold) later in this function.
+	 * The threshold will be set to 0 when !tsd_fast.
+	 */
+	assert(tsd_fast(tsd)
+	    || *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd) == 0);
+
+	emap_alloc_ctx_t alloc_ctx JEMALLOC_CC_SILENCE_INIT({0, 0, false});
+	size_t                     usize;
+	if (!size_hint) {
+		bool err = emap_alloc_ctx_try_lookup_fast(
+		    tsd, &arena_emap_global, ptr, &alloc_ctx);
+
+		/* Note: profiled objects will have alloc_ctx.slab set */
+		if (unlikely(err || !alloc_ctx.slab
+		        || free_fastpath_nonfast_aligned(ptr,
+		            /* check_prof */ false))) {
+			return false;
+		}
+		assert(alloc_ctx.szind != SC_NSIZES);
+		usize = sz_index2size(alloc_ctx.szind);
+	} else {
+		/*
+		 * Check for both sizes that are too large, and for sampled /
+		 * special aligned objects.  The alignment check will also check
+		 * for null ptr.
+		 */
+		if (unlikely(size > SC_LOOKUP_MAXCLASS
+		        || free_fastpath_nonfast_aligned(ptr,
+		            /* check_prof */ true))) {
+			return false;
+		}
+		sz_size2index_usize_fastpath(size, &alloc_ctx.szind, &usize);
+		/* Max lookup class must be small. */
+		assert(alloc_ctx.szind < SC_NBINS);
+		/* This is a dead store, except when opt size checking is on. */
+		alloc_ctx.slab = true;
+	}
+	/*
+	 * Currently the fastpath only handles small sizes.  The branch on
+	 * SC_LOOKUP_MAXCLASS makes sure of it.  This lets us avoid checking
+	 * tcache szind upper limit (i.e. tcache_max) as well.
+	 */
+	assert(alloc_ctx.slab);
+
+	uint64_t deallocated, threshold;
+	te_free_fastpath_ctx(tsd, &deallocated, &threshold);
+
+	uint64_t deallocated_after = deallocated + usize;
+	/*
+	 * Check for events and tsd non-nominal (fast_threshold will be set to
+	 * 0) in a single branch.  Note that this handles the uninitialized case
+	 * as well (TSD init will be triggered on the non-fastpath).  Therefore
+	 * anything that depends on a functional TSD (e.g. the alloc_ctx sanity
+	 * check below) needs to be after this branch.
+	 */
+	if (unlikely(deallocated_after >= threshold)) {
+		return false;
+	}
+	assert(tsd_fast(tsd));
+	bool fail = maybe_check_alloc_ctx(tsd, ptr, &alloc_ctx);
+	if (fail) {
+		/* See the comment in isfree. */
+		return true;
+	}
+
+	tcache_t    *tcache = tcache_get_from_ind(tsd, TCACHE_IND_AUTOMATIC,
+	       /* slow */ false, /* is_alloc */ false);
+	cache_bin_t *bin = &tcache->bins[alloc_ctx.szind];
+
+	/*
+	 * If junking were enabled, this is where we would do it.  It's not
+	 * though, since we ensured above that we're on the fast path.  Assert
+	 * that to double-check.
+	 */
+	assert(!opt_junk_free);
+
+	if (!cache_bin_dalloc_easy(bin, ptr)) {
+		return false;
+	}
+
+	/* Commit the advanced counter only once the bin accepted ptr. */
+	*tsd_thread_deallocatedp_get(tsd) = deallocated_after;
+
+	return true;
+}
+
+/*
+ * Sized deallocation without flags: try the fastpath with the size hint and
+ * fall back to sdallocx_default() with flags 0 when it declines.
+ */
+JEMALLOC_ALWAYS_INLINE void JEMALLOC_NOTHROW
+je_sdallocx_noflags(void *ptr, size_t size) {
+	if (free_fastpath(ptr, size, true)) {
+		return;
+	}
+	sdallocx_default(ptr, size, 0);
+}
+
+/*
+ * Sized deallocation with flags.  The fastpath is attempted only when flags
+ * is 0; every other case is forwarded to sdallocx_default().
+ */
+JEMALLOC_ALWAYS_INLINE void JEMALLOC_NOTHROW
+je_sdallocx_impl(void *ptr, size_t size, int flags) {
+	if (flags == 0 && free_fastpath(ptr, size, true)) {
+		return;
+	}
+	sdallocx_default(ptr, size, flags);
+}
+
+/*
+ * free() implementation: try the fastpath without a size hint and fall back
+ * to free_default() when it declines.
+ */
+JEMALLOC_ALWAYS_INLINE void JEMALLOC_NOTHROW
+je_free_impl(void *ptr) {
+	if (free_fastpath(ptr, 0, false)) {
+		return;
+	}
+	free_default(ptr);
+}
+
+#endif /* JEMALLOC_INTERNAL_INLINES_C_H */
--- a/include/jemalloc/internal/jemalloc_internal_macros.h
+++ b/include/jemalloc/internal/jemalloc_internal_macros.h
@ -1,57 +1,146 @@
-/*
- * JEMALLOC_ALWAYS_INLINE and JEMALLOC_INLINE are used within header files for
- * functions that are static inline functions if inlining is enabled, and
- * single-definition library-private functions if inlining is disabled.
- *
- * JEMALLOC_ALWAYS_INLINE_C and JEMALLOC_INLINE_C are for use in .c files, in
- * which case the denoted functions are always static, regardless of whether
- * inlining is enabled.
- */
-#if defined(JEMALLOC_DEBUG) || defined(JEMALLOC_CODE_COVERAGE)
-   /* Disable inlining to make debugging/profiling easier. */
-#  define JEMALLOC_ALWAYS_INLINE
-#  define JEMALLOC_ALWAYS_INLINE_C static
-#  define JEMALLOC_INLINE
-#  define JEMALLOC_INLINE_C static
-#  define inline
+#ifndef JEMALLOC_INTERNAL_MACROS_H
+#define JEMALLOC_INTERNAL_MACROS_H
+
+#ifdef JEMALLOC_DEBUG
+#	define JEMALLOC_ALWAYS_INLINE static inline
 #else
-#  define JEMALLOC_ENABLE_INLINE
-#  ifdef JEMALLOC_HAVE_ATTR
-#    define JEMALLOC_ALWAYS_INLINE \
-	 static inline JEMALLOC_ATTR(unused) JEMALLOC_ATTR(always_inline)
-#    define JEMALLOC_ALWAYS_INLINE_C \
-	 static inline JEMALLOC_ATTR(always_inline)
-#  else
-#    define JEMALLOC_ALWAYS_INLINE static inline
-#    define JEMALLOC_ALWAYS_INLINE_C static inline
-#  endif
-#  define JEMALLOC_INLINE static inline
-#  define JEMALLOC_INLINE_C static inline
-#  ifdef _MSC_VER
-#    define inline _inline
-#  endif
+#	ifdef _MSC_VER
+#		define JEMALLOC_ALWAYS_INLINE static __forceinline
+#	else
+#		define JEMALLOC_ALWAYS_INLINE                                 \
+			JEMALLOC_ATTR(always_inline) static inline
+#	endif
+#endif
+#ifdef _MSC_VER
+#	define inline _inline
 #endif

-#ifdef JEMALLOC_CC_SILENCE
-#  define UNUSED JEMALLOC_ATTR(unused)
-#else
-#  define UNUSED
-#endif
+#define UNUSED JEMALLOC_ATTR(unused)

-#define	ZU(z)	((size_t)z)
-#define	ZI(z)	((ssize_t)z)
-#define	QU(q)	((uint64_t)q)
-#define	QI(q)	((int64_t)q)
+/*
+ * Cast helpers: ZU/ZD convert to size_t/ssize_t, QU/QD to uint64_t/int64_t.
+ * The argument is parenthesized so that the cast applies to the whole
+ * expression (e.g. ZU(a / b)) rather than only to its first operand.
+ */
+#define ZU(z) ((size_t)(z))
+#define ZD(z) ((ssize_t)(z))
+#define QU(q) ((uint64_t)(q))
+#define QD(q) ((int64_t)(q))

-#define	KZU(z)	ZU(z##ULL)
-#define	KZI(z)	ZI(z##LL)
-#define	KQU(q)	QU(q##ULL)
-#define	KQI(q)	QI(q##LL)
+/*
+ * Constant variants: paste an integer-literal suffix onto the argument, then
+ * cast via ZU/ZD/QU/QD.  KQD must expand to QD; the QI helper it used to name
+ * no longer exists.
+ */
+#define KZU(z) ZU(z##ULL)
+#define KZD(z) ZD(z##LL)
+#define KQU(q) QU(q##ULL)
+#define KQD(q) QD(q##LL)

 #ifndef __DECONST
-#  define	__DECONST(type, var)	((type)(uintptr_t)(const void *)(var))
+#	define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var))
 #endif

-#ifndef JEMALLOC_HAS_RESTRICT
-#  define restrict
+#if !defined(JEMALLOC_HAS_RESTRICT) || defined(__cplusplus)
+#	define restrict
 #endif
+
+/*
+ * Various function pointers are static and immutable, except in JEMALLOC_JET
+ * (testing) builds, where they are extern and non-const.
+ */
+#ifndef JEMALLOC_JET
+#	define JET_MUTABLE const
+#	define JET_EXTERN static
+#else
+#	define JET_MUTABLE
+#	define JET_EXTERN extern
+#endif
+
+/*
+ * Split an argument list: HEAD yields the first argument, TAIL yields
+ * everything after the first argument.
+ */
+#define JEMALLOC_VA_ARGS_HEAD(head, ...) head
+#define JEMALLOC_VA_ARGS_TAIL(head, ...) __VA_ARGS__
+
+/*
+ * Diagnostic suppression macros.  Every branch below defines the same set of
+ * JEMALLOC_DIAGNOSTIC_* names, so they can be used unconditionally; where a
+ * compiler has no matching pragma the macro expands to nothing.
+ */
+#if defined(_MSC_VER) && !defined(__clang__)
+#	define JEMALLOC_DIAGNOSTIC_PUSH __pragma(warning(push))
+#	define JEMALLOC_DIAGNOSTIC_POP __pragma(warning(pop))
+#	define JEMALLOC_DIAGNOSTIC_IGNORE(W) __pragma(warning(disable : W))
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_FRAME_ADDRESS
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED
+#	define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+/* #pragma GCC diagnostic first appeared in gcc 4.6. */
+#elif (defined(__GNUC__)                                                       \
+    && ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 5))))          \
+    || defined(__clang__)
+/*
+ * The JEMALLOC_PRAGMA__ macro is an implementation detail of the GCC and Clang
+ * diagnostic suppression macros and should not be used anywhere else.
+ */
+#	define JEMALLOC_PRAGMA__(X) _Pragma(#X)
+#	define JEMALLOC_DIAGNOSTIC_PUSH JEMALLOC_PRAGMA__(GCC diagnostic push)
+#	define JEMALLOC_DIAGNOSTIC_POP JEMALLOC_PRAGMA__(GCC diagnostic pop)
+#	define JEMALLOC_DIAGNOSTIC_IGNORE(W)                                  \
+		JEMALLOC_PRAGMA__(GCC diagnostic ignored W)
+
+/*
+ * The -Wmissing-field-initializers warning is buggy in GCC versions < 5.1 and
+ * in clang.  The warning is suppressed for GCC major versions below 5 and for
+ * every clang version; on other compilers the macro expands to nothing.
+ */
+#	if ((defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 5))     \
+	    || defined(__clang__)
+#		define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS \
+			JEMALLOC_DIAGNOSTIC_IGNORE(                                  \
+			    "-Wmissing-field-initializers")
+#	else
+#		define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+#	endif
+
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_FRAME_ADDRESS                       \
+		JEMALLOC_DIAGNOSTIC_IGNORE("-Wframe-address")
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS                         \
+		JEMALLOC_DIAGNOSTIC_IGNORE("-Wtype-limits")
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER                    \
+		JEMALLOC_DIAGNOSTIC_IGNORE("-Wunused-parameter")
+/* -Walloc-size-larger-than= is only suppressed for (non-clang) GCC >= 7. */
+#	if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 7)
+#		define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN      \
+			JEMALLOC_DIAGNOSTIC_IGNORE("-Walloc-size-larger-than=")
+#	else
+#		define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+#	endif
+#	ifdef JEMALLOC_HAVE_ATTR_DEPRECATED
+#		define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED                  \
+			JEMALLOC_DIAGNOSTIC_IGNORE("-Wdeprecated-declarations")
+#	else
+#		define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED
+#	endif
+/* Note: pushes diagnostic state without a matching pop. */
+#	define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS                           \
+		JEMALLOC_DIAGNOSTIC_PUSH                                       \
+		JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER
+#else
+#	define JEMALLOC_DIAGNOSTIC_PUSH
+#	define JEMALLOC_DIAGNOSTIC_POP
+#	define JEMALLOC_DIAGNOSTIC_IGNORE(W)
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_FRAME_ADDRESS
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
+#	define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED
+#	define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+#endif
+
+/* Mirror __clang_analyzer__ into a jemalloc-namespaced macro. */
+#ifdef __clang_analyzer__
+#	define JEMALLOC_CLANG_ANALYZER
+#endif
+
+/*
+ * Analyzer-only helpers.  With JEMALLOC_CLANG_ANALYZER defined,
+ * JEMALLOC_CLANG_ANALYZER_SUPPRESS expands to __attribute__((suppress)) and
+ * JEMALLOC_CLANG_ANALYZER_SILENCE_INIT(v) expands to "= v" (an initializer).
+ * Otherwise both expand to nothing.
+ */
+#ifdef JEMALLOC_CLANG_ANALYZER
+#	define JEMALLOC_CLANG_ANALYZER_SUPPRESS __attribute__((suppress))
+#	define JEMALLOC_CLANG_ANALYZER_SILENCE_INIT(v) = v
+#else
+#	define JEMALLOC_CLANG_ANALYZER_SUPPRESS
+#	define JEMALLOC_CLANG_ANALYZER_SILENCE_INIT(v)
+#endif
+
+/*
+ * Emit the given code between a diagnostic push/pop pair with the deprecation
+ * diagnostic ignored.  On configurations where the JEMALLOC_DIAGNOSTIC_*
+ * macros are empty this expands to just the given code.
+ */
+#define JEMALLOC_SUPPRESS_WARN_ON_USAGE(...)                                   \
+	JEMALLOC_DIAGNOSTIC_PUSH                                               \
+	JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED                                  \
+	__VA_ARGS__                                                            \
+	JEMALLOC_DIAGNOSTIC_POP
+
+/*
+ * Disables spurious diagnostics for all headers.  Since these headers are not
+ * included by users directly, it does not affect their diagnostic settings.
+ */
+JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
+
+#endif /* JEMALLOC_INTERNAL_MACROS_H */
--- a/include/jemalloc/internal/jemalloc_internal_overrides.h
+++ b/include/jemalloc/internal/jemalloc_internal_overrides.h
@ -0,0 +1,22 @@
+#ifndef JEMALLOC_INTERNAL_OVERRIDES_H
+#define JEMALLOC_INTERNAL_OVERRIDES_H
+
+/*
+ * Compile-time overrides for settings that are normally chosen through the
+ * corresponding autoconf options at configure-time.  In an ordinary build this
+ * header does nothing; it exists for the case where the autoconf-generated
+ * headers are fed into a different build system.
+ */
+
+/* Redefine LG_PAGE when an override value is supplied. */
+#if defined(JEMALLOC_OVERRIDE_LG_PAGE)
+#	undef LG_PAGE
+#	define LG_PAGE JEMALLOC_OVERRIDE_LG_PAGE
+#endif
+
+/* Redefine JEMALLOC_CONFIG_MALLOC_CONF when an override value is supplied. */
+#if defined(JEMALLOC_OVERRIDE_JEMALLOC_CONFIG_MALLOC_CONF)
+#	undef JEMALLOC_CONFIG_MALLOC_CONF
+#	define JEMALLOC_CONFIG_MALLOC_CONF                                    \
+		JEMALLOC_OVERRIDE_JEMALLOC_CONFIG_MALLOC_CONF
+#endif
+
+#endif /* JEMALLOC_INTERNAL_OVERRIDES_H */
--- a/include/jemalloc/internal/jemalloc_internal_types.h
+++ b/include/jemalloc/internal/jemalloc_internal_types.h
@ -0,0 +1,148 @@
+#ifndef JEMALLOC_INTERNAL_TYPES_H
+#define JEMALLOC_INTERNAL_TYPES_H
+
+#include "jemalloc/internal/quantum.h"
+
+/* Processor / core id type. */
+typedef int malloc_cpuid_t;
+
+/*
+ * Policy applied when realloc() is called with a non-NULL pointer and a size
+ * of 0.
+ */
+enum zero_realloc_action_e {
+	/* Realloc(ptr, 0) is free(ptr); return malloc(0); */
+	zero_realloc_action_alloc = 0,
+	/* Realloc(ptr, 0) is free(ptr); */
+	zero_realloc_action_free = 1,
+	/* Realloc(ptr, 0) aborts. */
+	zero_realloc_action_abort = 2
+};
+typedef enum zero_realloc_action_e zero_realloc_action_t;
+
+/* Signature of write callback. */
+typedef void(write_cb_t)(void *, const char *);
+
+/*
+ * Initialization state.  The values are assigned in descending order so that
+ * the fully initialized state is 0 (see the "Common case" note on that
+ * enumerator).
+ */
+enum malloc_init_e {
+	malloc_init_uninitialized = 3,
+	malloc_init_a0_initialized = 2,
+	malloc_init_recursible = 1,
+	malloc_init_initialized = 0 /* Common case --> jnz. */
+};
+typedef enum malloc_init_e malloc_init_t;
+
+/*
+ * Flags bits:
+ *
+ * a: arena
+ * t: tcache
+ * 0: unused
+ * z: zero
+ * n: alignment
+ *
+ * aaaaaaaa aaaatttt tttttttt 0znnnnnn
+ *
+ * The accessor macros below parenthesize their flags argument so that an
+ * expression such as (a | b) can be passed safely.
+ */
+#define MALLOCX_ARENA_BITS 12
+#define MALLOCX_TCACHE_BITS 12
+#define MALLOCX_LG_ALIGN_BITS 6
+#define MALLOCX_ARENA_SHIFT 20
+#define MALLOCX_TCACHE_SHIFT 8
+#define MALLOCX_ARENA_MASK                                                     \
+	((unsigned)(((1U << MALLOCX_ARENA_BITS) - 1) << MALLOCX_ARENA_SHIFT))
+/* NB: Arena index bias decreases the maximum number of arenas by 1. */
+#define MALLOCX_ARENA_LIMIT ((unsigned)((1U << MALLOCX_ARENA_BITS) - 1))
+#define MALLOCX_TCACHE_MASK                                                    \
+	((unsigned)(((1U << MALLOCX_TCACHE_BITS) - 1) << MALLOCX_TCACHE_SHIFT))
+#define MALLOCX_TCACHE_MAX ((unsigned)((1U << MALLOCX_TCACHE_BITS) - 3))
+#define MALLOCX_LG_ALIGN_MASK ((1 << MALLOCX_LG_ALIGN_BITS) - 1)
+/* Use MALLOCX_ALIGN_GET() if alignment may not be specified in flags. */
+#define MALLOCX_ALIGN_GET_SPECIFIED(flags)                                     \
+	(ZU(1) << ((flags) & MALLOCX_LG_ALIGN_MASK))
+/* Clears bit 0, so an lg-alignment field of 0 yields an alignment of 0. */
+#define MALLOCX_ALIGN_GET(flags)                                               \
+	(MALLOCX_ALIGN_GET_SPECIFIED(flags) & (SIZE_T_MAX - 1))
+#define MALLOCX_ZERO_GET(flags) ((bool)((flags) & MALLOCX_ZERO))
+
+/* The stored tcache field is biased by 2 and the arena field by 1. */
+#define MALLOCX_TCACHE_GET(flags)                                              \
+	(((unsigned)(((flags) & MALLOCX_TCACHE_MASK) >> MALLOCX_TCACHE_SHIFT)) \
+	    - 2)
+#define MALLOCX_ARENA_GET(flags)                                               \
+	(((unsigned)(((unsigned)(flags)) >> MALLOCX_ARENA_SHIFT)) - 1)
+
+/* Smallest size class to support. */
+#define TINY_MIN (1U << LG_TINY_MIN)
+
+#define LONG ((size_t)(1U << LG_SIZEOF_LONG))
+#define LONG_MASK (LONG - 1)
+
+/* Return the smallest long multiple that is >= a. */
+#define LONG_CEILING(a) (((a) + LONG_MASK) & ~LONG_MASK)
+
+/*
+ * SIZEOF_PTR is size_t-typed, like LONG.  With a plain unsigned constant,
+ * ~PTR_MASK would be a 32-bit value that zero-extends when combined with a
+ * 64-bit operand, and PTR_CEILING() would then clear the operand's upper bits.
+ */
+#define SIZEOF_PTR ((size_t)(1U << LG_SIZEOF_PTR))
+#define PTR_MASK (SIZEOF_PTR - 1)
+
+/* Return the smallest (void *) multiple that is >= a. */
+#define PTR_CEILING(a) (((a) + PTR_MASK) & ~PTR_MASK)
+
+/*
+ * Maximum size of L1 cache line.  This is used to avoid cache line aliasing.
+ * In addition, this controls the spacing of cacheline-spaced size classes.
+ *
+ * CACHELINE cannot be based on LG_CACHELINE because __declspec(align()) can
+ * only handle raw constants.  The two values must therefore be kept in sync
+ * by hand (CACHELINE == 1 << LG_CACHELINE).
+ */
+#define LG_CACHELINE 6
+#define CACHELINE 64
+#define CACHELINE_MASK (CACHELINE - 1)
+
+/* Return the smallest cacheline multiple that is >= s. */
+#define CACHELINE_CEILING(s) (((s) + CACHELINE_MASK) & ~CACHELINE_MASK)
+
+/*
+ * Alignment helpers.  The mask arithmetic below is only meaningful when
+ * alignment is a power of two.  Every use of the alignment argument is
+ * parenthesized so that an expression such as (1 << n) can be passed safely.
+ */
+
+/* Return the nearest aligned address at or below a. */
+#define ALIGNMENT_ADDR2BASE(a, alignment)                                      \
+	((void *)(((byte_t *)(a))                                              \
+	    - (((uintptr_t)(a)) - ((uintptr_t)(a) & ((~(alignment)) + 1)))))
+
+/* Return the offset between a and the nearest aligned address at or below a. */
+#define ALIGNMENT_ADDR2OFFSET(a, alignment)                                    \
+	((size_t)((uintptr_t)(a) & ((alignment) - 1)))
+
+/* Return the smallest alignment multiple that is >= s. */
+#define ALIGNMENT_CEILING(s, alignment)                                        \
+	(((s) + ((alignment) - 1)) & ((~(alignment)) + 1))
+
+/*
+ * Return the nearest aligned address at or above a.
+ *
+ * While at first glance this would appear to be merely a more complicated
+ * way to perform the same computation as `ALIGNMENT_CEILING`,
+ * this has the important additional property of not concealing pointer
+ * provenance from the compiler. See the block-comment on the
+ * definition of `byte_t` for more details.
+ */
+#define ALIGNMENT_ADDR2CEILING(a, alignment)                                   \
+	((void *)(((byte_t *)(a))                                              \
+	    + (((((uintptr_t)(a)) + ((alignment) - 1)) & ((~(alignment)) + 1)) \
+	        - ((uintptr_t)(a)))))
+
+/*
+ * Declare a variable-length array.
+ *
+ * When the compiler is pre-C99 or defines __STDC_NO_VLA__, the "array" is a
+ * pointer to alloca()ed storage; otherwise it is a true VLA.
+ */
+#if __STDC_VERSION__ < 199901L || defined(__STDC_NO_VLA__)
+#	ifdef _MSC_VER
+#		include <malloc.h>
+#		define alloca _alloca
+#	else
+#		ifdef JEMALLOC_HAS_ALLOCA_H
+#			include <alloca.h>
+#		else
+#			include <stdlib.h>
+#		endif
+#	endif
+#	define VARIABLE_ARRAY_UNSAFE(type, name, count)                       \
+		type *name = alloca(sizeof(type) * (count))
+#else
+#	define VARIABLE_ARRAY_UNSAFE(type, name, count) type name[(count)]
+#endif
+/*
+ * Checked variant: asserts that the requested size is at most
+ * VARIABLE_ARRAY_SIZE_MAX bytes before declaring the array.  It expands to two
+ * statements (an assert and a declaration), so it cannot be the sole body of
+ * an unbraced if/for/while.
+ */
+#define VARIABLE_ARRAY_SIZE_MAX 2048
+#define VARIABLE_ARRAY(type, name, count)                                      \
+	assert(sizeof(type) * (count) <= VARIABLE_ARRAY_SIZE_MAX);             \
+	VARIABLE_ARRAY_UNSAFE(type, name, count)
+
+#define CALLOC_MADVISE_THRESHOLD_DEFAULT (((size_t)1) << 23) /* 8 MB */
+
+#endif /* JEMALLOC_INTERNAL_TYPES_H */
--- a/include/jemalloc/internal/jemalloc_preamble.h.in
+++ b/include/jemalloc/internal/jemalloc_preamble.h.in
@ -0,0 +1,286 @@
+#ifndef JEMALLOC_PREAMBLE_H
+#define JEMALLOC_PREAMBLE_H
+
+#include "jemalloc/internal/jemalloc_internal_defs.h"
+#include "jemalloc/internal/jemalloc_internal_decls.h"
+
+/*
+ * UTRACE_CALL wraps utrace() in one of two call shapes: a two-argument form
+ * when JEMALLOC_UTRACE is defined, or a three-argument form with the fixed
+ * label "jemalloc_process" when only JEMALLOC_UTRACE_LABEL is defined.  In the
+ * labeled case JEMALLOC_UTRACE is defined as well, so code keyed on it (e.g.
+ * config_utrace below) is enabled for both shapes.
+ */
+#if defined(JEMALLOC_UTRACE) || defined(JEMALLOC_UTRACE_LABEL)
+#include <sys/ktrace.h>
+#  if defined(JEMALLOC_UTRACE)
+#    define UTRACE_CALL(p, l) utrace(p, l)
+#  else
+#    define UTRACE_CALL(p, l) utrace("jemalloc_process", p, l)
+#    define JEMALLOC_UTRACE
+#  endif
+#endif
+
+/*
+ * JEMALLOC_N() prefixes a name: with jet_ in JEMALLOC_JET builds, and with the
+ * @private_namespace@ substitution otherwise.  The public header is always
+ * included with JEMALLOC_NO_DEMANGLE defined.  JEMALLOC_JET builds also
+ * undefine JEMALLOC_IS_MALLOC, pull in public_namespace.h, and include the
+ * public header with JEMALLOC_NO_RENAME temporarily defined.
+ */
+#define JEMALLOC_NO_DEMANGLE
+#ifdef JEMALLOC_JET
+#  undef JEMALLOC_IS_MALLOC
+#  define JEMALLOC_N(n) jet_##n
+#  include "jemalloc/internal/public_namespace.h"
+#  define JEMALLOC_NO_RENAME
+#  include "../jemalloc@install_suffix@.h"
+#  undef JEMALLOC_NO_RENAME
+#else
+#  define JEMALLOC_N(n) @private_namespace@##n
+#  include "../jemalloc@install_suffix@.h"
+#endif
+
+#if defined(JEMALLOC_OSATOMIC)
+#include <libkern/OSAtomic.h>
+#endif
+
+#ifdef JEMALLOC_ZONE
+#include <mach/mach_error.h>
+#include <mach/mach_init.h>
+#include <mach/vm_map.h>
+#endif
+
+#include "jemalloc/internal/jemalloc_internal_macros.h"
+
+/*
+ * Note that the ordering matters here; the hook itself is name-mangled.  We
+ * want the inclusion of hooks to happen early, so that we hook as much as
+ * possible.
+ */
+#ifndef JEMALLOC_NO_PRIVATE_NAMESPACE
+#  ifndef JEMALLOC_JET
+#    include "jemalloc/internal/private_namespace.h"
+#  else
+#    include "jemalloc/internal/private_namespace_jet.h"
+#  endif
+#endif
+#include "jemalloc/internal/test_hooks.h"
+
+/*
+ * Hard-coded value used for JEMALLOC_MADV_FREE when
+ * JEMALLOC_DEFINE_MADVISE_FREE is set.
+ * NOTE(review): presumably for systems whose headers lack MADV_FREE -- confirm
+ * against the configure logic.
+ */
+#ifdef JEMALLOC_DEFINE_MADVISE_FREE
+#  define JEMALLOC_MADV_FREE 8
+#endif
+
+/*
+ * JEMALLOC_DEFINE_MADVISE_COLLAPSE may be defined at compile time when
+ * madvise(..., MADV_COLLAPSE) is known to be supported but the MADV_COLLAPSE
+ * constant is not defined.
+ */
+#ifdef JEMALLOC_DEFINE_MADVISE_COLLAPSE
+#  define JEMALLOC_MADV_COLLAPSE 25
+#endif
+
+/*
+ * Each constant below is true exactly when the corresponding JEMALLOC_* macro
+ * is defined.  config_malloc_conf instead carries the value of
+ * JEMALLOC_CONFIG_MALLOC_CONF.
+ */
+#ifdef JEMALLOC_DEBUG
+static const bool config_debug = true;
+#else
+static const bool config_debug = false;
+#endif
+
+#ifdef JEMALLOC_DSS
+static const bool have_dss = true;
+#else
+static const bool have_dss = false;
+#endif
+
+#ifdef JEMALLOC_HAVE_MADVISE_HUGE
+static const bool have_madvise_huge = true;
+#else
+static const bool have_madvise_huge = false;
+#endif
+
+#ifdef JEMALLOC_HAVE_PROCESS_MADVISE
+static const bool have_process_madvise = true;
+#else
+static const bool have_process_madvise = false;
+#endif
+
+#ifdef JEMALLOC_FILL
+static const bool config_fill = true;
+#else
+static const bool config_fill = false;
+#endif
+
+#ifdef JEMALLOC_LAZY_LOCK
+static const bool config_lazy_lock = true;
+#else
+static const bool config_lazy_lock = false;
+#endif
+
+static const char * const config_malloc_conf = JEMALLOC_CONFIG_MALLOC_CONF;
+
+#ifdef JEMALLOC_PROF
+static const bool config_prof = true;
+#else
+static const bool config_prof = false;
+#endif
+
+#ifdef JEMALLOC_PROF_LIBGCC
+static const bool config_prof_libgcc = true;
+#else
+static const bool config_prof_libgcc = false;
+#endif
+
+#ifdef JEMALLOC_PROF_LIBUNWIND
+static const bool config_prof_libunwind = true;
+#else
+static const bool config_prof_libunwind = false;
+#endif
+
+#ifdef JEMALLOC_PROF_FRAME_POINTER
+static const bool config_prof_frameptr = true;
+#else
+static const bool config_prof_frameptr = false;
+#endif
+
+#ifdef JEMALLOC_MAPS_COALESCE
+static const bool maps_coalesce = true;
+#else
+static const bool maps_coalesce = false;
+#endif
+
+#ifdef JEMALLOC_STATS
+static const bool config_stats = true;
+#else
+static const bool config_stats = false;
+#endif
+
+#ifdef JEMALLOC_TLS
+static const bool config_tls = true;
+#else
+static const bool config_tls = false;
+#endif
+
+#ifdef JEMALLOC_UTRACE
+static const bool config_utrace = true;
+#else
+static const bool config_utrace = false;
+#endif
+
+#ifdef JEMALLOC_XMALLOC
+static const bool config_xmalloc = true;
+#else
+static const bool config_xmalloc = false;
+#endif
+
+#ifdef JEMALLOC_CACHE_OBLIVIOUS
+static const bool config_cache_oblivious = true;
+#else
+static const bool config_cache_oblivious = false;
+#endif
+/*
+ * Undocumented, for jemalloc development use only at the moment.  See the note
+ * in jemalloc/internal/log.h.
+ */
+#ifdef JEMALLOC_LOG
+static const bool config_log = true;
+#else
+static const bool config_log = false;
+#endif
+
+/*
+ * Are extra safety checks enabled; things like checking the size of sized
+ * deallocations, double-frees, etc.
+ *
+ * Debug builds turn this on as well, so that fast checks can be guarded by
+ * config_opt_safety_checks alone (one flag instead of two) and still run in
+ * debug mode.
+ */
+#if defined(JEMALLOC_OPT_SAFETY_CHECKS) || defined(JEMALLOC_DEBUG)
+static const bool config_opt_safety_checks = true;
+#else
+static const bool config_opt_safety_checks = false;
+#endif
+
+/*
+ * Extra debugging of sized deallocations too onerous to be included in the
+ * general safety checks.
+ */
+#if defined(JEMALLOC_OPT_SIZE_CHECKS) || defined(JEMALLOC_DEBUG)
+static const bool config_opt_size_checks = true;
+#else
+static const bool config_opt_size_checks = false;
+#endif
+
+/* True when JEMALLOC_UAF_DETECTION is defined, and in every debug build. */
+#if defined(JEMALLOC_UAF_DETECTION) || defined(JEMALLOC_DEBUG)
+static const bool config_uaf_detection = true;
+#else
+static const bool config_uaf_detection = false;
+#endif
+
+/* Whether or not the C++ extensions are enabled. */
+#ifdef JEMALLOC_ENABLE_CXX
+static const bool config_enable_cxx = true;
+#else
+static const bool config_enable_cxx = false;
+#endif
+
+/* Currently percpu_arena depends on sched_getcpu. */
+#if defined(_WIN32) || defined(__APPLE__) || defined(JEMALLOC_HAVE_SCHED_GETCPU)
+#define JEMALLOC_PERCPU_ARENA
+#endif
+
+#ifdef JEMALLOC_PERCPU_ARENA
+static const bool have_percpu_arena = true;
+#else
+static const bool have_percpu_arena = false;
+#endif
+
+/*
+ * Undocumented, and not recommended; the application should take full
+ * responsibility for tracking provenance.
+ */
+#ifdef JEMALLOC_FORCE_IVSALLOC
+static const bool force_ivsalloc = true;
+#else
+static const bool force_ivsalloc = false;
+#endif
+
+#ifdef JEMALLOC_BACKGROUND_THREAD
+static const bool have_background_thread = true;
+#else
+static const bool have_background_thread = false;
+#endif
+
+/* Keyed on the availability of CLOCK_REALTIME. */
+#ifdef JEMALLOC_HAVE_CLOCK_REALTIME
+static const bool config_high_res_timer = true;
+#else
+static const bool config_high_res_timer = false;
+#endif
+
+#ifdef JEMALLOC_HAVE_MEMCNTL
+static const bool have_memcntl = true;
+#else
+static const bool have_memcntl = false;
+#endif
+
+#endif /* JEMALLOC_PREAMBLE_H */
--- a/include/jemalloc/internal/jemalloc_probe.h
+++ b/include/jemalloc/internal/jemalloc_probe.h
@ -0,0 +1,49 @@
+#ifndef JEMALLOC_INTERNAL_JEMALLOC_PROBE_H
+#define JEMALLOC_INTERNAL_JEMALLOC_PROBE_H
+
+#include <jemalloc/internal/jemalloc_preamble.h>
+
+/*
+ * JE_USDT(name, N, ...) takes a probe name, an argument count N, and N
+ * arguments.  Its definition comes from the STAP or custom backend header
+ * when one of those is selected.  Under MSVC it expands to nothing.
+ * Otherwise it expands to _JE_USDT_CHECK_ARG<N>, which only evaluates each
+ * argument as a discarded (void) expression, so the arguments are still
+ * compiled.  Only N = 1..5 are provided by this fallback.
+ */
+#ifdef JEMALLOC_EXPERIMENTAL_USDT_STAP
+#include <jemalloc/internal/jemalloc_probe_stap.h>
+#elif defined(JEMALLOC_EXPERIMENTAL_USDT_CUSTOM)
+#include <jemalloc/internal/jemalloc_probe_custom.h>
+#elif defined(_MSC_VER)
+#define JE_USDT(name, N, ...) /* Nothing */
+#else /*  no USDT, just check the args */
+
+/* Dispatch on N by token-pasting it onto the checker macro name. */
+#define JE_USDT(name, N, ...) _JE_USDT_CHECK_ARG##N(__VA_ARGS__)
+
+#define _JE_USDT_CHECK_ARG1(a)						\
+	do {								\
+		(void)(a);						\
+	} while (0)
+#define _JE_USDT_CHECK_ARG2(a, b)					\
+	do {								\
+		(void)(a);						\
+		(void)(b);						\
+	} while (0)
+#define _JE_USDT_CHECK_ARG3(a, b, c)					\
+	do {								\
+		(void)(a);						\
+		(void)(b);						\
+		(void)(c);						\
+	} while (0)
+#define _JE_USDT_CHECK_ARG4(a, b, c, d)					\
+	do {								\
+		(void)(a);						\
+		(void)(b);						\
+		(void)(c);						\
+		(void)(d);						\
+	} while (0)
+#define _JE_USDT_CHECK_ARG5(a, b, c, d, e)				\
+	do {								\
+		(void)(a);						\
+		(void)(b);						\
+		(void)(c);						\
+		(void)(d);						\
+		(void)(e);						\
+	} while (0)
+
+#endif /* JEMALLOC_EXPERIMENTAL_USDT_* */
+
+#endif /* JEMALLOC_INTERNAL_JEMALLOC_PROBE_H */
--- a/Show more
+++ b/Show more