Compare commits


395 commits
5.3.0 ... dev

Author SHA1 Message Date
Guangli Dai
81034ce1f1
Update ChangeLog for release 5.3.1 2026-04-13 17:12:37 -07:00
Ian Ker-Seymer
b8646f4db3 Fix opt.max_background_threads default in docs 2026-04-13 14:46:53 -07:00
Guangli Dai
6515df8cec
Documentation updates (#2869)
* Document new mallctl interfaces added since 5.3.0

Add documentation for the following new mallctl entries:
- opt.debug_double_free_max_scan: double-free detection scan limit
- opt.prof_bt_max: max profiling backtrace depth
- opt.disable_large_size_classes: page-aligned large allocations
- opt.process_madvise_max_batch: batched process_madvise purging
- thread.tcache.max: per-thread tcache_max control
- thread.tcache.ncached_max.read_sizeclass: query ncached_max
- thread.tcache.ncached_max.write: set ncached_max per size range
- arena.<i>.name: get/set arena names
- arenas.hugepage: hugepage size
- approximate_stats.active: lightweight active bytes estimate

Remove config.prof_frameptr since it needs more development
and is still experimental.

Co-authored-by: lexprfuncall <carl.shapiro@gmail.com>
2026-04-07 10:41:44 -07:00
Slobodan Predolac
f265645d02 Emit retained HPA slab stats in JSON 2026-04-01 23:15:19 -04:00
Slobodan Predolac
db7d99703d Add TODO to benchmark possibly better policy 2026-04-01 23:15:19 -04:00
Slobodan Predolac
6281482c39 Nest HPA SEC stats inside hpa_shard JSON 2026-04-01 23:15:19 -04:00
Slobodan Predolac
3cc56d325c Fix large alloc nrequests under-counting on cache misses 2026-04-01 23:15:19 -04:00
Slobodan Predolac
a47fa33b5a Run clang-format on test/unit/tcache_max.c 2026-04-01 23:15:19 -04:00
Slobodan Predolac
b507644cb0 Fix conf_handle_char_p zero-sized dest and remove unused conf_handle_unsigned 2026-04-01 23:15:19 -04:00
Slobodan Predolac
3ac9f96158 Run clang-format on test/unit/conf_parse.c 2026-04-01 23:15:19 -04:00
Slobodan Predolac
5904a42187 Fix memory leak of old curr_reg on san_bump_grow_locked failure
When san_bump_grow_locked fails, it sets sba->curr_reg to NULL.
The old curr_reg (saved in to_destroy) was never freed or restored,
leaking the virtual memory extent. Restore sba->curr_reg from
to_destroy on failure so the old region remains usable.
2026-04-01 23:15:19 -04:00
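The fix above follows a common save/restore pattern. A minimal sketch, with illustrative names (bump_sketch and bump_grow_sketch are not jemalloc's actual san_bump internals, and grow_ok simulates whether the replacement succeeds):

```c
#include <assert.h>
#include <stddef.h>

struct bump_sketch {
	void *curr_reg;
};

static int
bump_grow_sketch(struct bump_sketch *sba, void *new_reg, int grow_ok) {
	void *to_destroy = sba->curr_reg; /* save the old region */
	sba->curr_reg = NULL;
	if (!grow_ok) {
		/* The fix: restore the old region so it remains usable
		 * instead of being leaked. */
		sba->curr_reg = to_destroy;
		return 1; /* failure */
	}
	sba->curr_reg = new_reg;
	/* Real code would destroy to_destroy here. */
	return 0; /* success */
}
```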
Slobodan Predolac
2fceece256 Fix extra size argument in edata_init call in extent_alloc_dss
An extra 'size' argument was passed where 'slab' (false) should be,
shifting all subsequent arguments: slab got size (nonzero=true),
szind got false (0), and sn got SC_NSIZES instead of a proper serial
number from extent_sn_next(). Match the correct pattern used by the
gap edata_init call above.
2026-04-01 23:15:19 -04:00
Slobodan Predolac
234404d324 Fix wrong loop variable for array index in sz_boot_pind2sz_tab
The sentinel fill loop used sz_pind2sz_tab[pind] (constant) instead
of sz_pind2sz_tab[i] (loop variable), writing only to the first
entry repeatedly and leaving subsequent entries uninitialized.
2026-04-01 23:15:19 -04:00
Slobodan Predolac
675ab079e7 Fix missing release of acquired neighbor edata in extent_try_coalesce_impl
When emap_try_acquire_edata_neighbor returned a non-NULL neighbor but
the size check failed, the neighbor was never released from
extent_state_merging, making it permanently invisible to future
allocation and coalescing operations.

Release the neighbor when it doesn't meet the size requirement,
matching the pattern used in extent_recycle_extract.
2026-04-01 23:15:19 -04:00
Slobodan Predolac
3f6e63e86a Fix wrong type for malloc_read_fd return value in prof_stack_range
Used size_t (unsigned) instead of ssize_t for the return value of
malloc_read_fd, which returns -1 on error. With size_t, -1 becomes
a huge positive value, bypassing the error check and corrupting the
remaining byte count.
2026-04-01 23:15:19 -04:00
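The bug class above can be reproduced in a few lines; pretend_read_fd is a hypothetical stand-in for malloc_read_fd that fails like read(2) does:

```c
#include <assert.h>
#include <stdint.h>
#include <sys/types.h>

/* Hypothetical stand-in for malloc_read_fd: returns -1 on error. */
static ssize_t
pretend_read_fd(void) {
	return -1;
}

static int
demo_signedness(void) {
	/* Buggy pattern: the -1 error code becomes SIZE_MAX, a huge
	 * positive byte count that passes any "is it negative?" check. */
	size_t wrong = (size_t)pretend_read_fd();
	assert(wrong == SIZE_MAX);

	/* Fixed pattern: keep the signed type until after the check. */
	ssize_t right = pretend_read_fd();
	assert(right < 0);
	return 0;
}
```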
Slobodan Predolac
dd30c91eaa Fix wrong fallback value in os_page_detect when sysconf fails
Returned LG_PAGE (log2 of page size, e.g. 12) instead of PAGE (actual
page size, e.g. 4096) when sysconf(_SC_PAGESIZE) failed. This would
cause os_page to be set to an absurdly small value, breaking all
page-aligned operations.
2026-04-01 23:15:19 -04:00
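A sketch of the corrected fallback; the macro names mirror the commit message, but the real os_page_detect differs:

```c
#include <assert.h>
#include <stddef.h>
#include <unistd.h>

#define LG_PAGE 12                  /* log2 of the page size */
#define PAGE ((size_t)1 << LG_PAGE) /* the page size itself: 4096 */

static size_t
os_page_detect_sketch(void) {
	long result = sysconf(_SC_PAGESIZE);
	if (result == -1) {
		/* Fall back to the page size, not its logarithm; the
		 * buggy version returned LG_PAGE (12) here. */
		return PAGE;
	}
	return (size_t)result;
}
```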
Slobodan Predolac
3a8bee81f1 Fix pac_mapped stats inflation on allocation failure
newly_mapped_size was set unconditionally in the ecache_alloc_grow
fallback path, even when the allocation returned NULL. This inflated
pac_mapped stats without a corresponding deallocation to correct them.

Guard the assignment with an edata != NULL check, matching the pattern
used in the batched allocation path above it.
2026-04-01 23:15:19 -04:00
Slobodan Predolac
c2d57040f0 Fix out-of-bounds write in malloc_vsnprintf when size is 0
When called with size==0, the else branch wrote to str[size-1] which
is str[(size_t)-1], a massive out-of-bounds write. Standard vsnprintf
allows size==0 to mean "compute length only, write nothing".

Add unit test for the size==0 case.
2026-04-01 23:15:19 -04:00
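The contract the fix adopts mirrors standard vsnprintf: with size == 0 nothing is written and the would-be length is returned. A minimal wrapper (fmt_sketch is illustrative, not jemalloc's malloc_snprintf):

```c
#include <assert.h>
#include <stdarg.h>
#include <stdio.h>

static int
fmt_sketch(char *str, size_t size, const char *format, ...) {
	va_list ap;
	va_start(ap, format);
	/* vsnprintf never writes past size bytes; size == 0 means
	 * "compute length only, write nothing", even if str is NULL. */
	int len = vsnprintf(str, size, format, ap);
	va_end(ap);
	return len;
}
```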
Slobodan Predolac
eab2b29736 Fix off-by-one in stats_arenas_i_bins_j and stats_arenas_i_lextents_j bounds checks
Same pattern as arenas_bin_i_index: used > instead of >= allowing
access one past the end of bstats[] and lstats[] arrays.

Add unit tests that verify boundary indices return ENOENT.
2026-04-01 23:15:19 -04:00
Slobodan Predolac
a0f2bdf91d Fix missing negation in large_ralloc_no_move usize_min fallback
The second expansion attempt in large_ralloc_no_move omitted the !
before large_ralloc_no_move_expand(), inverting the return value.
On expansion failure, the function falsely reported success, making
callers believe the allocation was expanded in-place when it was not.
On expansion success, the function falsely reported failure, causing
callers to unnecessarily allocate, copy, and free.

Add unit test that verifies the return value matches actual size change.
2026-04-01 23:15:19 -04:00
Slobodan Predolac
87f9938de5 Fix duplicate "nactive_huge" JSON key in HPA shard stats output
In both the full_slabs and empty_slabs JSON sections of HPA shard
stats, "nactive_huge" was emitted twice instead of emitting
"ndirty_huge" as the second entry. This caused ndirty_huge to be
missing from the JSON output entirely.

Add a unit test that verifies both sections contain "ndirty_huge".
2026-04-01 23:15:19 -04:00
Slobodan Predolac
513778bcb1 Fix off-by-one in arenas_bin_i_index and arenas_lextent_i_index bounds checks
The index validation used > instead of >=, allowing access at index
SC_NBINS (for bins) and SC_NSIZES-SC_NBINS (for lextents), which are
one past the valid range. This caused out-of-bounds reads in bin_infos[]
and sz_index2size_unsafe().

Add unit tests that verify the boundary indices return ENOENT.
2026-04-01 23:15:19 -04:00
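The off-by-one in both bounds-check fixes above reduces to one operator. A sketch (SC_NBINS_SKETCH is an illustrative value; the real SC_NBINS is configuration-dependent):

```c
#include <assert.h>
#include <errno.h>
#include <stddef.h>

#define SC_NBINS_SKETCH 36 /* illustrative bin count */

/* Valid indices are 0 .. n-1, so rejection must use >=, not >.
 * With >, index n slips through and reads one past the array end. */
static int
index_check_sketch(size_t i, size_t n) {
	if (i >= n) {
		return ENOENT; /* i == n is one past the valid range */
	}
	return 0;
}
```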
Slobodan Predolac
176ea0a801 Remove experimental.thread.activity_callback 2026-04-01 16:23:41 -07:00
Slobodan Predolac
19bbefe136 Remove dead code: extent_commit_wrapper, large_salloc, tcache_gc_dalloc event waits
These functions had zero callers anywhere in the codebase:
- extent_commit_wrapper: wrapper never called, _impl used directly
- large_salloc: trivial wrapper never called
- tcache_gc_dalloc_new_event_wait: no header declaration, no callers
- tcache_gc_dalloc_postponed_event_wait: no header declaration, no callers
2026-04-01 17:48:19 -04:00
Weixie Cui
a87c518bab Fix typo in prof_log_rep_check: use != instead of || for alloc_count
The condition incorrectly used 'alloc_count || 0', which was likely a typo
for 'alloc_count != 0'. While the two expressions evaluate identically for
the zero/non-zero case, the fix ensures consistency with the bt_count and
thr_count checks and uses the correct comparison operator.
2026-03-26 10:42:29 -07:00
Slobodan Predolac
d758349ca4 Fix psset_pick_purge when last candidate with index 0 dirtiness is ineligible
psset_pick_purge used max_bit-- after rejecting a time-ineligible
candidate, which caused unnecessary re-scanning of the same bitmap
(triggering an assertion failure in debug mode) and a size_t underflow
when the lowest-index entry was rejected.  Use max_bit = ind - 1
to skip directly past the rejected index.
2026-03-26 10:39:37 -07:00
Tony Printezis
1d018d8fda improve hpdata_assert_consistent()
A few ways this consistency check can be improved:
* Print which conditions fail and associated values.
* Accumulate the result so that we can print all conditions that fail.
* Turn hpdata_assert_consistent() into a macro so that, when it fails,
  we can get the line number it was called from.
2026-03-26 10:39:23 -07:00
Carl Shapiro
86b7219213 Add unit tests for conf parsing and its helpers 2026-03-10 18:14:33 -07:00
Carl Shapiro
ad726adf75 Separate out the configuration code from initialization 2026-03-10 18:14:33 -07:00
Carl Shapiro
a056c20d67 Handle tcache init failures gracefully
tsd_tcache_data_init() returns true on failure but its callers ignore
this return value, leaving the per-thread tcache in an uninitialized
state after a failure.

This change disables the tcache on an initialization failure and logs
an error message.  If opt_abort is true, it will also abort.

New unit tests have been added to test tcache initialization failures.
2026-03-10 18:14:33 -07:00
Carl Shapiro
a75655badf Add unit test coverage for bin interfaces 2026-03-10 18:14:33 -07:00
Carl Shapiro
0ac9380cf1 Move bin inline functions from arena_inlines_b.h to bin_inlines.h
This is a continuation of my previous clean-up change, now focusing on
the inline functions defined in header files.
2026-03-10 18:14:33 -07:00
Carl Shapiro
1cc563f531 Move bin functions from arena.c to bin.c
This is a clean-up change that gives the bin functions implemented in
the arena code a prefix of bin_ and moves them into the bin code.

To further decouple the bin code from the arena code, bin functions
that had taken an arena_t to check arena_is_auto now take an is_auto
parameter instead.
2026-03-10 18:14:33 -07:00
guangli-dai
c73ab1c2ff Add a test to check the output in JSON-based stats is consistent with mallctl results. 2026-03-10 18:14:33 -07:00
guangli-dai
12b33ed8f1 Fix wrong mutex stats in json-formatted malloc stats
During mutex stats emission, derived counters are not emitted for JSON.
Yet the array-indexing counter must still be incremented to skip the
derived elements in the output, which it was not. This commit fixes that.
2026-03-10 18:14:33 -07:00
Carl Shapiro
79cc7dcc82 Guard os_page_id against a NULL address
While undocumented, the prctl system call will set errno to ENOMEM
when passed NULL as an address.  Under that condition, an assertion
that checks for EINVAL as the only possible errno value will fail.  To
avoid the assertion failure, this change skips the call to os_page_id
when the address is NULL.  NULL can only occur after mmap fails, in
which case there is no mapping to name.
2026-03-10 18:14:33 -07:00
Yuxuan Chen
a10ef3e1f1 configure: add --with-cxx-stdlib option
When C++ support is enabled, configure unconditionally probes
`-lstdc++` and keeps it in LIBS if the link test succeeds. On
platforms using libc++, this probe can succeed at compile time (if
libstdc++ headers/libraries happen to be installed) but then cause
runtime failures when configure tries to execute test binaries
because `libstdc++.so.6` isn't actually available.

Add a `--with-cxx-stdlib=<libstdc++|libcxx>` option that lets the
build system specify which C++ standard library to link. When given,
the probe is skipped and the specified library is linked directly.
When not given, the original probe behavior is preserved.
2026-03-10 18:14:33 -07:00
Tony Printezis
0fa27fd28f Run single subtest from a test file
Add a mechanism for selecting a single subtest to run from a test file. The test harness reads the JEMALLOC_TEST_NAME environment variable and, if set, runs only subtests with that name.
2026-03-10 18:14:33 -07:00
Slobodan Predolac
34ace9169b Remove prof_threshold built-in event. It is trivial to implement it as user event if needed 2026-03-10 18:14:33 -07:00
Andrei Pechkurov
4d0ffa075b Fix background thread initialization race 2026-03-10 18:14:33 -07:00
Slobodan Predolac
d4908fe44a Revert "Experimental configuration option for fast path prefetch from cache_bin"
This reverts commit f9fae9f1f8.
2026-03-10 18:14:33 -07:00
Carl Shapiro
c51abba131 Determine the page size on Android from NDK header files
The definition of the PAGE_SIZE macro is used as a signal for a 32-bit
target or a 64-bit target with an older NDK.  Otherwise, a 16KiB page
size is assumed.

Closes: #2657
2026-03-10 18:14:33 -07:00
Carl Shapiro
5f353dc283 Remove an incorrect use of the address operator
The address of the local variable created_threads is a different
location than the data it points to.  Incorrectly treating these
values as being the same can cause out-of-bounds writes to the stack.

Closes: facebook/jemalloc#59
2026-03-10 18:14:33 -07:00
Carl Shapiro
365747bc8d Use the BRE construct \{1,\} for one or more consecutive matches
This removes duplication introduced by my earlier commit that
eliminated the use of the non-standard "\+" from BREs in the
configure script.
2026-03-10 18:14:33 -07:00
Slobodan Predolac
6016d86c18 [SEC] Make SEC owned by hpa_shard, simplify the code, add stats, lock per bin 2026-03-10 18:14:33 -07:00
Slobodan Predolac
c7690e92da Remove Cirrus CI 2026-03-10 18:14:33 -07:00
Slobodan Predolac
441e840df7 Add a script to generate github actions instead of Travis CI and Cirrus 2026-03-10 18:14:33 -07:00
Guangli Dai
0988583d7c Add a mallctl for users to get an approximate of active bytes. 2026-03-10 18:14:33 -07:00
Slobodan Predolac
8a06b086f3 [EASY] Extract hpa_central component from hpa source file 2026-03-10 18:14:33 -07:00
Slobodan Predolac
355774270d [EASY] Encapsulate better, do not pass hpa_shard when hooks are enough, move shard independent actions to hpa_utils 2026-03-10 18:14:33 -07:00
Slobodan Predolac
47aeff1d08 Add experimental_enforce_hugify 2026-03-10 18:14:33 -07:00
Shirui Cheng
6d4611197e move fill/flush pointer array out of tcache.c 2026-03-10 18:14:33 -07:00
Slobodan Predolac
3678a57c10 When extracting from central, hugify_eager is different than start_as_huge 2026-03-10 18:14:33 -07:00
guangli-dai
2cfa41913e Refactor init_system_thp_mode and print it in malloc stats. 2026-03-10 18:14:33 -07:00
Slobodan Predolac
87555dfbb2 Do not release the hpa_shard->mtx when inserting newly retrieved page from central before allocating from it 2026-03-10 18:14:33 -07:00
Carl Shapiro
f714cd9249 Inline the value of an always false boolean local variable
Next to its use, which is always as an argument, we include the name
of the parameter in a constant.  This completes a partially
implemented cleanup suggested in an earlier commit.
2026-03-10 18:14:33 -07:00
Slobodan Predolac
5e49c28ef0 [EASY] Spelling in the comments 2026-03-10 18:14:33 -07:00
Slobodan Predolac
7c40be249c Add npurges and npurge_passes to output of pa_benchmark 2026-03-10 18:14:33 -07:00
Slobodan Predolac
707aab0c95 [pa-bench] Add clock to pa benchmark 2026-03-10 18:14:33 -07:00
Slobodan Predolac
a199278f37 [HPA] Add ability to start page as huge and more flexibility for purging 2026-03-10 18:14:33 -07:00
Slobodan Predolac
ace437d26a Running clang-format on two files 2026-03-10 18:14:33 -07:00
Slobodan Predolac
2688047b56 Revert "Do not dehugify when purging"
This reverts commit 16c5abd1cd.
2026-03-10 18:14:33 -07:00
Slobodan Predolac
de886e05d2 Revert "Remove an unused function and global variable"
This reverts commit acd85e5359.
2026-03-10 18:14:33 -07:00
guangli-dai
755735a6bf Remove Travis Windows CI for now since it has infra failures. 2026-03-10 18:14:33 -07:00
Slobodan Predolac
d70882a05d [sdt] Add some tracepoints to sec and hpa modules 2026-03-10 18:14:33 -07:00
Carl Shapiro
67435187d1 Improve the portability of grep patterns in configure.ac
The configure.ac script uses backslash plus in its grep patterns to
match one or more occurrences.  This is a GNU grep extension to the
Basic Regular Expressions syntax that fails on systems with a more
traditional grep.  This change fixes grep patterns that use backslash
plus to use a star instead.

Closes: #2777
2026-03-10 18:14:33 -07:00
guangli-dai
261591f123 Add a page-allocator microbenchmark. 2026-03-10 18:14:33 -07:00
guangli-dai
56cdce8592 Adding trace analysis in preparation for page allocator microbenchmark. 2026-03-10 18:14:33 -07:00
Carl Shapiro
daf44173c5 Replace an instance of indentation with spaces with tabs 2026-03-10 18:14:33 -07:00
Aurélien Brooke
ce02945070 Add missing thread_event_registry.c to Visual Studio projects
This file was added by b2a35a905f.
2026-03-10 18:14:33 -07:00
lexprfuncall
c51949ea3e Update config.guess and config.sub to the latest versions
These files need to be refreshed periodically to support new platform
types.

The following command was used to retrieve the updates

curl -L -O https://git.savannah.gnu.org/cgit/config.git/plain/config.guess
curl -L -O https://git.savannah.gnu.org/cgit/config.git/plain/config.sub

Closes: #2814
2026-03-10 18:14:33 -07:00
Carl Shapiro
5a634a8d0a Always use pthread_equal to compare thread IDs
This change replaces direct comparisons of Pthread thread IDs with
calls to pthread_equal.  Directly comparing thread IDs is neither
portable nor reliable since a thread ID is defined as an opaque type
that can be implemented using a structure.
2026-03-10 18:14:33 -07:00
Slobodan Predolac
5d5f76ee01 Remove pidfd_open call handling and rely on PIDFD_SELF 2026-03-10 18:14:33 -07:00
lexprfuncall
9442300cc3 Change the default page size to 64KiB on Aarch64 Linux
This updates the configuration script to set the default page size to
64KiB on Aarch64 Linux.  This is motivated by compatibility as a build
configured for a 64KiB page will work on kernels that use the smaller
4KiB or 16KiB pages, whereas the reverse is not true.

To make the configured page size setting more visible, the script now
displays the page size when printing the configuration results.

Users who want to override the page size to choose a smaller value
can still do so with the --with-lg-pagesize configuration option.
2026-03-10 18:14:33 -07:00
Slobodan Predolac
2a66c0be5a [EASY][BUGFIX] Spelling and format 2026-03-10 18:14:33 -07:00
lexprfuncall
38b12427b7 Define malloc_{write,read}_fd as non-inline global functions
The static inline definition made more sense when these functions just
dispatched to a syscall wrapper.  Now that they have acquired a retry
loop, a non-inline definition makes more sense.
2026-03-10 18:14:33 -07:00
lexprfuncall
9fdc1160c5 Handle interruptions and retries of read(2) and write(2) 2026-03-10 18:14:33 -07:00
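The retry behaviour the two commits above describe can be sketched as follows; write_full_sketch is illustrative, not jemalloc's malloc_write_fd, but it shows the EINTR and partial-write handling:

```c
#include <assert.h>
#include <errno.h>
#include <unistd.h>

/* Keep calling write(2) until every byte is out, retrying on EINTR
 * and on partial writes; return -1 on a real error. */
static ssize_t
write_full_sketch(int fd, const void *buf, size_t count) {
	const char *p = buf;
	size_t left = count;
	while (left > 0) {
		ssize_t n = write(fd, p, left);
		if (n < 0) {
			if (errno == EINTR) {
				continue; /* interrupted: retry */
			}
			return -1; /* real error */
		}
		p += n;
		left -= (size_t)n;
	}
	return (ssize_t)count;
}
```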
lexprfuncall
48b4ad60a7 Remove an orphaned comment
This was left behind when definitions of malloc_open and malloc_close
were abstracted from code that had followed.
2026-03-10 18:14:33 -07:00
Shirui Cheng
2114349a4e Revert PR #2608: Manually revert commits 70c94d..f9c0b5
Closes: #2707
2026-03-10 18:14:33 -07:00
lexprfuncall
ced8b3cffb Fix the compilation check for process madvise
An include of unistd.h is needed to make the declaration of the
syscall function visible to the compiler.  The include of sys/mman.h
is not used at all.
2026-03-10 18:14:33 -07:00
Slobodan Predolac
5e98585b37 Save and restore errno when calling process_madvise 2026-03-10 18:14:33 -07:00
lexprfuncall
e4fa33148a Remove an unused function and global variable
When the dehugify functionality was retired in a previous commit, a
dehugify-related function and global variable in a test were
accidentally left in place, causing builds that add -Werror to CFLAGS
to fail.
2026-03-10 18:14:33 -07:00
Slobodan Predolac
d73de95f72 Experimental configuration option for fast path prefetch from cache_bin 2026-03-10 18:14:33 -07:00
lexprfuncall
9528a2e2dd Use relaxed atomics to access the process madvise pid fd
Relaxed atomics already provide sequentially consistent access to single
location data structures.
2026-03-10 18:14:33 -07:00
lexprfuncall
a156e997d7 Do not dehugify when purging
Giving the advice MADV_DONTNEED to a range of virtual memory backed by
a transparent huge page already causes that range of virtual memory to
become backed by regular pages.
2026-03-10 18:14:33 -07:00
lexprfuncall
395e63bf7e Fix several spelling errors in comments 2026-03-10 18:14:33 -07:00
Slobodan Predolac
4246475b44 [process_madvise] Make init lazy so that python tests pass. Reset the pidfd on fork 2026-03-10 18:14:33 -07:00
Slobodan Predolac
f87bbab22c Add several USDT probes for hpa 2026-03-10 18:14:33 -07:00
Slobodan Predolac
711fff750c Add experimental support for usdt systemtap probes 2026-03-10 18:14:33 -07:00
guangli-dai
5847516692 Ignore the clang-format changes in the git blame. 2026-03-10 18:14:33 -07:00
guangli-dai
6200e8987f Reformat the codebase with the clang-format 18. 2026-03-10 18:14:33 -07:00
Shirui Cheng
a952a3b8b0 Update the default value for opt_experimental_tcache_gc and opt_calloc_madvise_threshold 2026-03-10 18:14:33 -07:00
Guangli Dai
e350c71571 Remove --enable-limit-usize-gap for cirrus CI since the config-time option is removed. 2026-03-10 18:14:33 -07:00
guangli-dai
95fc091b0f Update appveyor settings. 2026-03-10 18:14:33 -07:00
dzhao.ampere
c5547f9e64 test/unit/psset.c: fix SIGSEGV when PAGESIZE is large
When hugepages are enabled and PAGESIZE is large, the test could
ask for a stack size larger than the user limit. Allocating the
memory instead avoids the failure.

Closes: #2408
2026-03-10 18:14:33 -07:00
Slobodan Predolac
015b017973 [thread_event] Add support for user events in thread events when stats are enabled 2026-03-10 18:14:33 -07:00
Slobodan Predolac
e6864c6075 [thread_event] Remove macros from thread_event and replace with dynamic event objects 2026-03-10 18:14:33 -07:00
Qi Wang
1972241cd2 Remove unused options in the batched madvise unit tests. 2025-06-02 11:25:37 -07:00
Jason Evans
27d7960cf9 Revert "Extend purging algorithm with peak demand tracking"
This reverts commit ad108d50f1.
2025-06-02 10:44:37 -07:00
guangli-dai
edaab8b3ad Turn clang-format off for codes with multi-line commands in macros 2025-05-28 19:22:21 -07:00
guangli-dai
4531411abe Modify .clang-format to have declarations aligned 2025-05-28 19:22:21 -07:00
guangli-dai
1818170c8d Fix binshard.sh by specifying bin_shards for all sizes. 2025-05-28 19:21:49 -07:00
guangli-dai
fd60645260 Add one more check to double free validation. 2025-05-28 19:21:49 -07:00
Xin Yang
5e460bfea2 Refactor: use the cache_bin_sz_t typedef instead of direct uint16_t
Any future changes to the underlying data type for bin sizes
(such as upgrading from `uint16_t` to `uint32_t`) can then be achieved
by modifying only the `cache_bin_sz_t` definition.

Signed-off-by: Xin Yang <yangxin.dev@bytedance.com>
2025-05-22 10:43:33 -07:00
Xin Yang
9169e9272a Fix: Adjust CACHE_BIN_NFLUSH_BATCH_MAX size to prevent assert failures
The maximum allowed value for `nflush_batch` is
`CACHE_BIN_NFLUSH_BATCH_MAX`. However, `tcache_bin_flush_impl_small`
could potentially declare an array of `emap_batch_lookup_result_t`
of size `CACHE_BIN_NFLUSH_BATCH_MAX + 1`, which leads to a
`VARIABLE_ARRAY` assertion failure, observed when
`tcache_nslots_small_max` is configured to 2048. This patch ensures
the array size does not exceed the allowed maximum.

Signed-off-by: Xin Yang <yangxin.dev@bytedance.com>
2025-05-22 10:27:09 -07:00
guangli-dai
f19a569216 Ignore formatting commit in blame. 2025-05-20 14:21:08 -07:00
Slobodan Predolac
b6338c4ff6 EASY - be explicit in non-vectorized hpa tests 2025-05-19 16:31:04 -07:00
guangli-dai
554185356b Sample format on tcache_max test 2025-05-19 15:06:13 -07:00
guangli-dai
3cee771cfa Modify .clang-format to make it more aligned with current freebsd style 2025-05-19 15:06:13 -07:00
Jiebin Sun
3c14707b01 To improve reuse efficiency, limit the maximum coalesced size for large
extents in the dirty ecache. This patch was tested with real workloads
using ClickHouse (ClickBench Q35) on a system with 2x240 vCPUs. The
results showed a 2X improvement in queries per second (QPS) and a
reduction in page faults to 29% of the previous rate. Additionally, a
microbenchmark involving 256 memory reallocations resizing from 4KB to
16KB in one arena demonstrated a 5X performance improvement.

Signed-off-by: Jiebin Sun <jiebin.sun@intel.com>
2025-05-12 15:45:36 -07:00
guangli-dai
37bf846cc3 Fixes to prevent static analysis warnings. 2025-05-06 14:47:35 -07:00
guangli-dai
8347f1045a Renaming limit_usize_gap to disable_large_size_classes 2025-05-06 14:47:35 -07:00
Guangli Dai
01e9ecbeb2 Remove build-time configuration 'config_limit_usize_gap' 2025-05-06 14:47:35 -07:00
Slobodan Predolac
852da1be15 Add experimental option force using SYS_process_madvise 2025-04-28 18:45:30 -07:00
Slobodan Predolac
1956a54a43 [process_madvise] Use process_madvise across multiple huge_pages 2025-04-25 19:19:03 -07:00
Slobodan Predolac
0dfb4a5a1a Add output argument to hpa_purge_begin to count dirty ranges 2025-04-25 19:19:03 -07:00
Slobodan Predolac
cfa90dfd80 Refactor hpa purging to prepare for vectorized call across multiple pages 2025-04-25 19:19:03 -07:00
Qi Wang
a3910b9802 Avoid forced purging during thread-arena migration when bg thd is on. 2025-04-25 19:18:20 -07:00
guangli-dai
c23a6bfdf6 Add opt.limit_usize_gap to stats 2025-04-16 10:38:10 -07:00
guangli-dai
c20a63a765 Silence the uninitialized warning from clang. 2025-04-16 10:38:10 -07:00
Qi Wang
f81fb92a89 Remove Travis CI macOS configs (not supported anymore). 2025-04-14 15:27:38 -07:00
Slobodan Predolac
f19f49ef3e if process_madvise is supported, call it when purging hpa 2025-04-04 13:57:42 -07:00
Kaspar M. Rohrer
80e9001af3 Move `extern "C"` specifications for C++ to where they are needed
This should fix errors when compiling C++ code with modules enabled on clang.
2025-03-31 10:41:51 -07:00
Shirui Cheng
3688dfb5c3 fix assertion error in huge_arena_auto_thp_switch() when b0 is deleted in unit test 2025-03-20 12:45:23 -07:00
Jay Lee
a4defdb854 detect false failure of strerror_r
See tikv/jemallocator#108.

In summary, the test for `strerror_r` can fail for reasons other
than `strerror_r` itself, so add an additional test to determine
whether the failure is expected.

Signed-off-by: Jay Lee <BusyJayLee@gmail.com>
2025-03-17 17:50:20 -07:00
Shirui Cheng
e1a77ec558 Support THP with Huge Arena in PAC 2025-03-17 16:06:43 -07:00
Audrey Dutcher
86bbabac32 background_thread: add fallback for pthread_create dlsym
If jemalloc is linked into a shared library, the RTLD_NEXT dlsym call
may fail since RTLD_NEXT is only specified to search all objects after
the current one in the loading order, and the pthread library may be
earlier in the load order. Instead of failing immediately, attempt one
more time to find pthread_create via RTLD_GLOBAL.

Errors cascading from this were observed on FreeBSD 14.1.
2025-03-17 09:41:04 -07:00
Guangli Dai
81f35e0b55 Modify Travis tests to use frameptr when profiling 2025-03-13 17:15:42 -07:00
Guangli Dai
773b5809f9 Fix frame pointer based unwinder to handle changing stack range 2025-03-13 17:15:42 -07:00
Dmitry Ilvokhin
ad108d50f1 Extend purging algorithm with peak demand tracking
Implementation inspired by idea described in "Beyond malloc efficiency
to fleet efficiency: a hugepage-aware memory allocator" paper [1].

The primary idea is to track the maximum number (peak) of active pages
in use within a sliding window and then use this number to decide how
many dirty pages we would like to keep.

We are trying to estimate the maximum amount of active memory we'll need
in the near future. We do so by projecting future active memory demand
(based on the peak active memory usage we observed in the past within
the sliding window) and adding slack on top of it (an overhead that is
reasonable to have in exchange for higher hugepage coverage). When peak
demand tracking is off, the projection of future active memory is simply
the active memory we have right now.

The estimation is essentially `nactive_max * (1 + dirty_mult)`.

The peak demand purging algorithm is controlled by two config options.
Option `hpa_peak_demand_window_ms` controls the duration of the sliding
window over which we track maximum active memory usage, and option
`hpa_dirty_mult` controls the amount of slack we are allowed to have, as
a percentage of the maximum active memory usage. By default
`hpa_peak_demand_window_ms == 0`, so we keep the same behaviour
(ratio-based purging) that we had before this commit.

[1]: https://storage.googleapis.com/gweb-research2023-media/pubtools/6170.pdf
2025-03-13 10:12:22 -07:00
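The estimate above can be sketched in a few lines. All names here are illustrative (not jemalloc's internals): nactive_now and nactive_max are page counts, dirty_mult is the slack fraction:

```c
#include <assert.h>
#include <stddef.h>

/* How many pages may stay unpurged: the projected active demand plus
 * dirty_mult slack on top, i.e. projected * (1 + dirty_mult). */
static size_t
ndirty_allowed_sketch(size_t nactive_now, size_t nactive_max,
    double dirty_mult, int peak_tracking_on) {
	/* Projection of future demand: the sliding-window peak when
	 * tracking is on, otherwise just the current active count. */
	size_t projected = peak_tracking_on ? nactive_max : nactive_now;
	return (size_t)((double)projected * (1.0 + dirty_mult));
}
```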
Qi Wang
22440a0207 Implement process_madvise support.
Add opt.process_madvise_max_batch, which determines whether process_madvise
is enabled (non-zero) and the maximum number of regions in each batch.
Another limiting factor is the space to reserve on the stack, which caps
the batch size at 128.
2025-03-07 15:32:32 -08:00
Guangli Dai
70f019cd3a Enable limit-usize-gap in CI tests.
Since the new usize calculation will become the default soon, add the
config option for Travis, Cirrus, and AppVeyor.
2025-03-06 15:08:13 -08:00
Guangli Dai
6035d4a8d3 Cache extra extents in the dirty pool from ecache_alloc_grow 2025-03-06 15:08:13 -08:00
guangli-dai
c067a55c79 Introducing a new usize calculation policy
Converting size to usize has historically been done by ceiling
size to the closest size class. However, this causes a lot of memory
waste with HPA enabled.  This commit changes how usize is calculated so
that the gap between two contiguous usizes is no larger than a page.
Specifically, this commit includes the following changes:

1. Add a build-time config option (--enable-limit-usize-gap) and a
runtime one (limit_usize_gap) to guard the changes.  When the build-time
config is enabled, some minor CPU overhead is expected because usize
will be stored and accessed separately from the index.  When the runtime
option is also enabled (it can only be enabled with the build-time
config enabled), a new usize calculation approach will be employed.  This
new calculation will ceil size to the closest multiple of PAGE for all
sizes larger than USIZE_GROW_SLOW_THRESHOLD instead of using the size
classes.  Note that when the build-time config is enabled, the runtime
option defaults to on.

2. Prepare tcache for size to grow by PAGE over GROUP*PAGE.
To prepare for the upcoming changes where size class grows by PAGE when
larger than NGROUP * PAGE, disable the tcache when it is larger than 2 *
NGROUP * PAGE. The threshold for tcache is set higher to prevent perf
regression as much as possible while usizes between NGROUP * PAGE and 2 *
NGROUP * PAGE happen to grow by PAGE.

3. Prepare pac and hpa psset for size to grow by PAGE over GROUP*PAGE
For PAC, to avoid having too many bins, arena bins still have the same
layout.  This means some extra search is needed for a page-level request
that is not aligned with the original size class: it should also search
the heap before the current index, since the previous heap might also
have some allocations satisfying it.  The same changes apply to HPA's
psset.
This search relies on enumeration of the heap because not all allocs in
the previous heap are guaranteed to satisfy the request.  To balance the
memory and CPU overhead, we currently enumerate at most a fixed number
of nodes before concluding that none can satisfy the request.

4. Add bytes counter to arena large stats.
To prepare for the upcoming usize changes, stats collected by
multiplying alive allocations and the bin size is no longer accurate.
Thus, add separate counters to record the bytes malloced and dalloced.

5. Change the structs used when freeing to avoid using index2size for large sizes.
  - Change the definition of emap_alloc_ctx_t
  - Change the read of both from edata_t.
  - Change the assignment and usage of emap_alloc_ctx_t.
  - Change other callsites of index2size.
Note that the changes in the data structure, i.e., emap_alloc_ctx_t,
will be used when the build-time config (--enable-limit-usize-gap) is
enabled, but they will store the same value as index2size(szind) if the
runtime option (opt_limit_usize_gap) is not enabled.

6. Adapt hpa to the usize changes.
Change the settings in sec to limit its usage for sizes larger than
USIZE_GROW_SLOW_THRESHOLD and modify the corresponding tests.

7. Modify usize calculation and corresponding tests.
Change sz_s2u_compute. Note that sz_index2size is no longer always safe,
while sz_size2index still works as expected.
2025-03-06 15:08:13 -08:00
Guangli Dai
ac279d7e71 Fix profiling sample metadata lookup during xallocx 2025-03-04 14:42:04 -08:00
Qi Wang
f55e0c3f5c Remove unsupported Cirrus CI config 2025-03-03 16:29:04 -08:00
Dmitry Ilvokhin
499f306859 Fix arena 0 deferral_allowed flag init
Arena 0 has a dedicated initialization path, which differs from the
initialization path of other arenas. The main difference for the purpose
of this change is that we initialize arena 0 before we initialize
background threads. HPA shard options have a `deferral_allowed` flag which
should be equal to the `background_thread_enabled()` return value, but
that wasn't the case before this change, because for arena 0
`background_thread_enabled()` was initialized only after arena 0's
initialization phase had already ended.

Below is the initialization sequence for arena 0 after this commit, to
illustrate that everything is still initialized correctly.

* `hpa_central_init` initializes HPA Central before we initialize any
  HPA shard (including arena 0's).
* `background_thread_boot1` initializes `background_thread_enabled()`
  return value.
* `pa_shard_enable_hpa` initializes arena 0 HPA shard.

```
                       malloc_init_hard -------------
                      /           /                  \
                     /           /                    \
                    /           /                      \
malloc_init_hard_a0_locked  background_thread_boot1  pa_shard_enable_hpa
        /                     /                          \
       /                     /                            \
      /                     /                              \
arena_boot       background_thread_enabled_set          hpa_shard_init
     |
     |
pa_central_init
     |
     |
hpa_central_init
```
2025-02-18 12:10:35 -08:00
Dmitry Ilvokhin
421b17a622 Remove age_counter from hpa_central
Before this commit we had two age counters: one global in HPA central
and one local in each HPA shard. We used the HPA shard counter when
reusing an empty pageslab, and the HPA central counter everywhere else.
They are supposed to be comparable, because we use them for allocation
placement decisions, but in reality they are not: there are no ordering
guarantees between them.

At the moment, there is no way for a pageslab to migrate between HPA
shards, so we don't actually need the HPA central age counter.
2025-02-13 16:00:41 -08:00
roblabla
c17bf8b368 Disable config from file or envvar with build flag
This adds a new autoconf flag, --disable-user-config, which disables
reading the configuration from /etc/malloc.conf or the MALLOC_CONF
environment variable. This can be useful when integrating jemalloc in a
binary that internally handles all aspects of the configuration and
shouldn't be impacted by ambient change in the environment.
2025-02-05 15:01:50 -08:00
Dmitry Ilvokhin
34c823f147 Add autoconf options to enable sanitizers
This commit allows enabling sanitizers with autoconf options, instead
of modifying `CFLAGS`, `CXXFLAGS` and `LDFLAGS` directly.

* `--enable-tsan` option to enable Thread Sanitizer.
* `--enable-ubsan` option to enable Undefined Behaviour Sanitizer.

The end goal is to speed up development by finding problems quickly,
early, and more easily. Eventually, when all current issues are fixed, we
can enable sanitizers in CI. Fortunately, there are not a lot of problems
we need to fix.

Address Sanitizer is a bit controversial, because it replaces the memory
allocator, so we decided to leave it out for a while.

Below are a couple of examples of how tests look under different
sanitizers at the moment.

```
$  ../configure --enable-tsan --enable-debug
<...>
asan               : 0
tsan               : 1
ubsan              : 0
$ make -j`nproc` check
<...>
  Thread T13 (tid=332043, running) created by main thread at:
    #0 pthread_create <null> (libtsan.so.0+0x61748)
    #1 thd_create ../test/src/thd.c:25 (bin_batching+0x5631ca)
    #2 stress_run ../test/unit/bin_batching.c:148
(bin_batching+0x40364c)
    #3 test_races ../test/unit/bin_batching.c:249
(bin_batching+0x403d79)
    #4 p_test_impl ../test/src/test.c:149 (bin_batching+0x562811)
    #5 p_test_no_reentrancy ../test/src/test.c:213
(bin_batching+0x562d35)
    #6 main ../test/unit/bin_batching.c:268 (bin_batching+0x40417e)

SUMMARY: ThreadSanitizer: data race
../include/jemalloc/internal/edata.h:498 in edata_nfree_inc
```

```
$ ../configure --enable-ubsan --enable-debug
<...>
asan               : 0
tsan               : 0
ubsan              : 1
$ make -j`nproc` check
<...>
=== test/unit/hash ===
../test/unit/hash.c:119:16: runtime error: left shift of 176 by 24
places cannot be represented in type 'int'
<...>
```
2025-02-05 14:28:28 -08:00
Qi Wang
3bc89cfeca Avoid implicit conversion in test/unit/prof_threshold 2025-01-31 10:18:36 -08:00
Qi Wang
1abeae9ebd Fix test/unit/prof_threshold when !config_stats 2025-01-30 10:39:49 -08:00
Shai Duvdevani
257e64b968 Unlike prof_sample which is supported only with profiling mode active, prof_threshold is intended to be an always-supported allocation callback with much less overhead. The usage of the threshold allows performance critical callers to change program execution based on the callback: e.g. drop caches when memory becomes high or to predict the program is about to OOM ahead of time using peak memory watermarks. 2025-01-29 18:55:52 -08:00
Dmitry Ilvokhin
ef8e512e29 Fix bitmap_ffu out of range read
We tried to load `g` from `bitmap[i]` before checking that it is actually
a valid load. Tweaked the loop a bit to `break` early when we are done
scanning for bits.

Before this commit, the undefined behaviour sanitizer from GCC 14+ was
unhappy with the `test/unit/bitmap` test, giving the following error.

```
../include/jemalloc/internal/bitmap.h:293:5: runtime error: load of
address 0x7bb1c2e08008 with insufficient space for an object of type
'const bitmap_t'
<...>
    #0 0x62671a149954 in bitmap_ffu ../include/jemalloc/internal/bitmap.h:293
    #1 0x62671a149954 in test_bitmap_xfu_body ../test/unit/bitmap.c:275
    #2 0x62671a14b767 in test_bitmap_xfu ../test/unit/bitmap.c:323
    #3 0x62671a376ad1 in p_test_impl ../test/src/test.c:149
    #4 0x62671a377135 in p_test ../test/src/test.c:200
    #5 0x62671a13da06 in main ../test/unit/bitmap.c:336
<...>
```
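The break-early pattern can be sketched like this (shape assumed; the real logic lives in `include/jemalloc/internal/bitmap.h` and differs in detail — `bitmap_ffu_sketch` and the 64-bit group layout are illustrative):

```c
#include <stddef.h>
#include <stdint.h>

typedef uint64_t bitmap_t;

/* Find the first set bit at or above min_bit, checking the group index
 * against ngroups BEFORE loading bitmap[i], so we never read past the
 * end of the array. Returns ngroups * 64 if no bit is found. */
static size_t
bitmap_ffu_sketch(const bitmap_t *bitmap, size_t ngroups, size_t min_bit) {
	for (size_t i = min_bit / 64;; i++) {
		if (i >= ngroups) {
			break; /* done scanning: bail out before the load */
		}
		bitmap_t g = bitmap[i];
		if (i == min_bit / 64) {
			/* Mask off bits below min_bit in the first group. */
			g &= ~(((bitmap_t)1 << (min_bit % 64)) - 1);
		}
		if (g != 0) {
			return i * 64 + (size_t)__builtin_ctzll(g);
		}
	}
	return ngroups * 64;
}
```

The fix is purely an ordering change: the bounds check moves ahead of the load, so the final iteration exits without touching `bitmap[ngroups]`.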
2025-01-28 10:42:20 -08:00
Qi Wang
607b866035 Check for 0 input when setting max_background_thread through mallctl.
Reported by @nc7s.
2025-01-28 10:38:56 -08:00
Qi Wang
20cc983314 Fix the gettid() detection caught by @mrluanma . 2025-01-22 10:30:53 -08:00
Dmitry Ilvokhin
52fa9577ba Fix integer overflow in test/unit/hash.c
`final[3]` is `uint8_t`. The integer conversion rank of `uint8_t` is lower
than the integer conversion rank of `int`, so `uint8_t` gets promoted to
`int`, which is a signed integer type. Shifting the `final[3]` value left
by 24 when the leftmost bit is set overflows `int`, which is undefined
behaviour.

Before this change, the Undefined Behaviour Sanitizer was unhappy about it,
with the following message.

```
../test/unit/hash.c:119:25: runtime error: left shift of 176 by 24
places cannot be represented in type 'int'
```

After this commit the problem is gone.
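The standard fix for this class of bug is to cast to an unsigned type before shifting; a minimal sketch (`load_be32` is a hypothetical helper, not the test's actual code):

```c
#include <stdint.h>

/* (uint8_t)176 promotes to int, so `final[0] << 24` with the top bit set
 * overflows signed int (UB). Casting each byte to uint32_t first keeps
 * the shifts in unsigned arithmetic, where wraparound is well defined. */
static uint32_t
load_be32(const uint8_t final[4]) {
	return ((uint32_t)final[0] << 24) | ((uint32_t)final[1] << 16) |
	    ((uint32_t)final[2] << 8) | (uint32_t)final[3];
}
```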
2025-01-17 12:54:22 -08:00
Dan Horák
17881ebbfd Add configure check for gettid() presence
The gettid() function is available on Linux in glibc only since version
2.30. There are supported distributions that still use an older glibc
version. Thus, add a configure check for whether the gettid() function is
available, and extend the check in src/prof_stack_range.c so it's also
skipped when gettid() isn't available.

Fixes: https://github.com/jemalloc/jemalloc/issues/2740
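A Linux-only sketch of the fallback shape (`JEMALLOC_HAVE_GETTID` and `my_gettid` are hypothetical names standing in for what such a configure check would define; on older glibc the raw syscall is the usual substitute):

```c
#define _GNU_SOURCE
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* Use the glibc wrapper when the configure check found it, otherwise
 * fall back to the raw Linux syscall, which works on any glibc. */
static pid_t
my_gettid(void) {
#ifdef JEMALLOC_HAVE_GETTID /* assumed macro a configure check would set */
	return gettid();
#else
	return (pid_t)syscall(SYS_gettid);
#endif
}
```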
2024-12-17 12:40:54 -08:00
appujee
4b88bddbca Conditionally remove unreachable for C23+ 2024-12-17 12:39:00 -08:00
appujee
d8486b2653 Remove unreachable() macro as c23 already defines it.
Taken from https://android-review.git.corp.google.com/c/platform/external/jemalloc_new/+/3316478

This might need more cleanups to remove the definition of JEMALLOC_INTERNAL_UNREACHABLE.
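A guarded definition along these lines lets pre-C23 and C23 builds coexist (a sketch; jemalloc's actual macro names and guards differ):

```c
#include <stddef.h>

/* C23 defines unreachable() in <stddef.h>; for earlier standards, fall
 * back to the GCC/Clang builtin so the same call site compiles in both. */
#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 202311L
#  ifndef unreachable
#    define unreachable() __builtin_unreachable()
#  endif
#endif

static int
sign(int x) {
	if (x > 0) {
		return 1;
	}
	if (x < 0) {
		return -1;
	}
	if (x == 0) {
		return 0;
	}
	unreachable(); /* all int values are covered above */
}
```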
2024-12-17 12:39:00 -08:00
Guangli Dai
587676fee8 Disable psset test when hugepage size is too large. 2024-12-17 12:35:35 -08:00
Guangli Dai
a17385a882 Enable large hugepage tests for arm64 on Travis 2024-12-17 12:35:35 -08:00
Guangli Dai
6786934280 Fix ehooks assertion for arena creation 2024-12-11 13:33:32 -08:00
Dmitry Ilvokhin
46690c9ec0 Fix test_retained on boxes with a lot of CPUs
We are trying to create `ncpus * 2` threads for this test and place them
into a `VARIABLE_ARRAY`, but a `VARIABLE_ARRAY` can not be more than
`VARIABLE_ARRAY_SIZE_MAX` bytes. When there are a lot of CPUs on the
box, the test always fails.

```
$ nproc
176

$ make -j`nproc` tests_unit && ./test/unit/retained
<jemalloc>: ../test/unit/retained.c:123: Failed assertion:
"sizeof(thd_t) * (nthreads) <= VARIABLE_ARRAY_SIZE_MAX"
Aborted (core dumped)
```

There is no need for high concurrency in this test, as we are only
checking stats there and its behaviour is quite stable with respect to
the number of allocating threads.

Limited the number of threads to 16 to save compute resources (on CI, for
example) and reduce test running time.

Before the change (`nproc` is 80 on this box).

```
$ make -j`nproc` tests_unit && time ./test/unit/retained
<...>
real    0m0.372s
user    0m14.236s
sys     0m12.338s
```

After the change (same box).

```
$ make -j`nproc` tests_unit && time ./test/unit/retained
<...>
real    0m0.018s
user    0m0.108s
sys     0m0.068s
```
2024-12-02 14:12:26 -08:00
Dmitry Ilvokhin
6092c980a6 Expose psset state stats
When evaluating changes in HPA logic, it is useful to know the internal
`hpa_shard` state. A great deal of this state is the `psset`. Some of the
`psset` stats were available, but in disaggregated form, which is not
very convenient. This commit exposes the `psset` counters to `mallctl`
and malloc stats dumps.

Example of how the malloc stats dump looks after the change.

HPA shard stats:
  Pageslabs: 14899 (4354 huge, 10545 nonhuge)
  Active pages: 6708166 (2228917 huge, 4479249 nonhuge)
  Dirty pages: 233816 (331 huge, 233485 nonhuge)
  Retained pages: 686306
  Purge passes: 8730 (10 / sec)
  Purges: 127501 (146 / sec)
  Hugeifies: 4358 (5 / sec)
  Dehugifies: 4 (0 / sec)

Pageslabs, active pages, dirty pages and retained pages are rows added
by this change.
2024-11-21 09:23:32 -08:00
Dmitry Ilvokhin
3820e38dc1 Remove validation for HPA ratios
Config validation was introduced in 3aae792b with the main intention of
fixing an infinite purging loop, but it didn't actually fix the underlying
problem, just masked it. Later, 47d69b4ea was merged to address the same
problem.

The options `hpa_dirty_mult` and `hpa_hugification_threshold` have different
application dimensions: `hpa_dirty_mult` applies to active memory on the
shard, while `hpa_hugification_threshold` is a threshold for a single
pageslab (hugepage). It doesn't make much sense to sum them up.

While it is true that a too-high value of `hpa_dirty_mult` and a too-low
value of `hpa_hugification_threshold` can lead to pathological
behaviour, the same holds for other options as well. Poor configurations
might lead to suboptimal and sometimes completely unacceptable
behaviour, and that's OK; that is exactly the reason why they are called
poor.

Other mechanisms exist to prevent extreme behaviour, such as hugifying
and then immediately purging a page; see the
`hpa_hugify_blocked_by_ndirty` function, which exists to prevent exactly
this case.

Lastly, the `hpa_dirty_mult + hpa_hugification_threshold >= 1` constraint is
too tight and rules out a lot of valid configurations.
2024-11-20 18:59:07 -08:00
Dmitry Ilvokhin
0ce13c6fb5 Add opt hpa_hugify_sync to hugify synchronously
Linux 6.1 introduced `MADV_COLLAPSE` flag to perform a best-effort
synchronous collapse of the native pages mapped by the memory range into
transparent huge pages.

Synchronous hugification might be beneficial for at least two reasons:
we no longer rely on khugepaged, and we get instant feedback if the
range wasn't hugified.

If the `hpa_hugify_sync` option is on, we'll try to collapse
synchronously, and if that isn't successful, we'll fall back to
asynchronous behaviour.
2024-11-20 10:52:52 -08:00
Dmitry Ilvokhin
a361e886e2 Move je_cv_thp logic closer to definition 2024-11-20 10:52:52 -08:00
Dmitry Ilvokhin
b82333fdec Split stats_arena_hpa_shard_print function
Make multiple functions from `stats_arena_hpa_shard_print` for
readability and ease of change in the future.
2024-11-08 12:18:15 -08:00
Dmitry Ilvokhin
b9758afff0 Add nstime_ms_since to get time since in ms
Milliseconds are used a lot in hpa, so it is convenient to have an
`nstime_ms_since` function instead of dividing by `MILLION` constantly.

For consistency, renamed `nstime_msec` to `nstime_ms`, as the `ms`
abbreviation is used much more commonly across the codebase than `msec`.

```
$ grep -Rn '_msec' include src | wc -l
2

$ grep -RPn '_ms( |,|:)' include src | wc -l
72
```

The `nstime_msec` function wasn't used anywhere in the code yet.
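The helper's shape, sketched with a simplified stand-in for the real `nstime_t` (the single-field layout and the sketch name are assumptions):

```c
#include <stdint.h>

/* Simplified stand-in for jemalloc's nstime_t: a nanosecond count. */
typedef struct {
	uint64_t ns;
} nstime_t;

/* Milliseconds elapsed between two timestamps: subtract, then divide by
 * one million, instead of repeating the division at every call site. */
static uint64_t
nstime_ms_since_sketch(const nstime_t *past, const nstime_t *now) {
	return (now->ns - past->ns) / 1000000;
}
```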
2024-11-08 10:37:28 -08:00
Qi Wang
2a693b83d2 Fix the sized-dealloc safety check abort msg. 2024-10-14 10:34:15 -07:00
Qi Wang
6d625d5e5e Add support for clock_gettime_nsec_np()
Prefer clock_gettime_nsec_np(CLOCK_UPTIME_RAW) to mach_absolute_time().
2024-10-14 10:33:27 -07:00
Guangli Dai
397827a27d Updated jeprof with more symbols to filter. 2024-10-14 10:31:58 -07:00
Qi Wang
02251c0070 Update the configure cache file example in INSTALL.md 2024-10-10 16:41:48 -07:00
Qi Wang
8c2b8bcf24 Update doc to reflect muzzy decay is disabled by default.
It has been disabled since 5.2.0 (in #1421).
2024-10-10 16:41:23 -07:00
Nathan Slingerland
edc1576f03 Add safe frame-pointer backtrace unwinder 2024-10-01 11:01:56 -07:00
Ben Niu
3a0d9cdadb Use MSVC __declspec(thread) for TSD on Windows 2024-09-30 11:33:44 -07:00
Guangli Dai
1c900088c3 Do not support hpa if HUGEPAGE is too large. 2024-09-27 15:34:13 -07:00
Dmitry Ilvokhin
4f4fd42447 Remove strict_min_purge_interval option
The `experimental_hpa_strict_min_purge_interval` option was expected to be
temporary, to simplify the rollout of a bugfix. Now that the bugfix rollout
is complete, it is safe to remove this option.
2024-09-25 11:49:18 -07:00
Qi Wang
6cc42173cb Assert the mutex is locked within malloc_mutex_assert_owner(). 2024-09-23 18:06:07 -07:00
Qi Wang
44db479fad Fix the lock owner sanity checking during background thread boot.
During boot, some mutexes are not initialized yet; plus, there's no point in
taking many mutexes while everything is covered by the global init lock, so
the locking assumptions in some functions (e.g. background_thread_enabled_set())
can't be enforced.  Skip the lock owner check in this case.
2024-09-23 18:06:07 -07:00
Guangli Dai
0181aaa495 Optimize edata_cmp_summary_compare when __uint128_t is available 2024-09-23 16:23:42 -07:00
roblabla
734f29ce56 Fix compilation with MSVC 2022
On MSVC, log is an intrinsic that doesn't require libm. However,
AC_SEARCH_LIBS does not successfully detect this, as it will try to
compile a program using the wrong signature for log. Newer versions of
MSVC CL detect this and reject the program with the following
messages:

conftest.c(40): warning C4391: 'char log()': incorrect return type for intrinsic function, expected 'double'
conftest.c(44): error C2168: 'log': too few actual parameters for intrinsic function

Since log is always available on MSVC (it's been around since the dawn
of time), we simply always assume it's there if MSVC is detected.
2024-09-23 10:42:31 -07:00
Qi Wang
de5606d0d8 Fix a missing init value warning caught by static analysis. 2024-09-20 16:56:07 -07:00
Qi Wang
1960536b61 Add malloc_mutex_is_locked() sanity checks. 2024-09-20 16:56:07 -07:00
Qi Wang
3eb7a4b53d Fix mutex state tracking around pthread_cond_wait().
pthread_cond_wait drops and re-acquires the mutex internally, w/o
going through our wrapper.  Update the locked state explicitly.
2024-09-20 16:56:07 -07:00
Qi Wang
661fb1e672 Fix the locked flag for malloc_mutex_trylock(). 2024-09-20 16:56:07 -07:00
Guangli Dai
db4f0e7182 Add travis tests for arm64. 2024-09-12 15:40:04 -07:00
Nathan Slingerland
8c2e15d1a5 Add malloc_open() / malloc_close() reentrancy safe helpers 2024-09-12 15:38:08 -07:00
Nathan Slingerland
60f472f367 Fix initialization of pop_attempt_results in bin_batching test 2024-09-12 11:36:17 -07:00
Qi Wang
323ed2e3a8 Optimize fast path to allow static size class computation.
After inlining at LTO time, many callsites have input size known which means the
index and usable size can be translated at compile time.  However the size-index
lookup table prevents it -- this commit solves that by switching to the compute
approach when the size is detected to be a known const.
2024-09-12 11:34:09 -07:00
Qi Wang
c1a3ca3755 Adjust the value width in stats output.
Some of the values are accumulative and can reach high after running for long
periods.
2024-09-11 14:29:32 -07:00
Qi Wang
3383b98f1b Check if the huge page size is expected when enabling HPA. 2024-09-04 15:43:59 -07:00
Qi Wang
cd05b19f10 Fix the VM over-reservation on aarch64 w/ larger pages.
HUGEPAGE could be larger on some platforms (e.g. 512M on aarch64 w/ 64K pages),
in which case it would cause grow_retained / exp_grow to over-reserve VMs.

Similarly, make sure the base alloc has a const 2M alignment.
2024-09-04 15:43:59 -07:00
Shirui Cheng
baa5a90cc6 fix nstime_update_mock in arena_decay unit test 2024-08-29 10:50:33 -07:00
Shirui Cheng
7c99686165 Better handle burst allocation on tcache_alloc_small_hard 2024-08-29 10:50:33 -07:00
Shirui Cheng
0c88be9e0a Regulate GC frequency by requiring a time interval between two consecutive GCs 2024-08-29 10:50:33 -07:00
Shirui Cheng
e2c9f3a9ce Take locality into consideration when doing GC flush 2024-08-29 10:50:33 -07:00
Shirui Cheng
14d5dc136a Allow a range for the nfill passed to arena_cache_bin_fill_small 2024-08-29 10:50:33 -07:00
Shirui Cheng
f68effe4ac Add a runtime option opt_experimental_tcache_gc to guard the new design 2024-08-29 10:50:33 -07:00
Ben Niu
9e123a833c Leverage new Windows API TlsGetValue2 for performance 2024-08-28 16:50:33 -07:00
Qi Wang
e29ac61987 Limit Cirrus CI to freebsd 15 and 14 2024-08-28 16:33:36 -07:00
Qi Wang
bd0a5b0f3b Fix static analysis warnings.
Newly reported warnings included several reserved macro identifiers and a
false-positive used-uninitialized.
2024-08-28 16:03:53 -07:00
Guangli Dai
5b72ac098a Remove tests for ppc64 on Travic CI. 2024-08-26 09:53:00 -07:00
Shirui Cheng
8c54637f8c Better trigger race condition in bin_batching unit test 2024-08-23 14:10:04 -07:00
Dmitry Ilvokhin
c7ccb8d7e9 Add experimental prefix to hpa_strict_min_purge_interval
Goal is to make it obvious this option is experimental.
2024-08-20 10:02:38 -07:00
Dmitry Ilvokhin
aaa29003ab Limit maximum number of purged slabs with option
The `experimental_hpa_max_purge_nhp` option was introduced for backward
compatibility reasons: to make it possible to have behaviour similar
to the buggy `hpa_strict_min_purge_interval` implementation.

When `experimental_hpa_max_purge_nhp` is set to -1, there is no limit
on the number of slabs we'll purge on each iteration. Otherwise, we'll purge
no more than `experimental_hpa_max_purge_nhp` hugepages (slabs). This in
turn means we might not purge enough dirty pages to satisfy the
`hpa_dirty_mult` requirement.

The combination of the `hpa_dirty_mult`, `experimental_hpa_max_purge_nhp` and
`hpa_strict_min_purge_interval` options allows us to have a steady rate of
pages returned to the system. This provides stricter latency
guarantees, as the number of `madvise` calls is bounded (and hence the
number of TLB shootdowns is limited), in exchange for weaker memory usage
guarantees.
2024-08-20 10:02:38 -07:00
Dmitry Ilvokhin
143f458188 Fix hpa_strict_min_purge_interval option logic
We update `shard->last_purge` on each call of `hpa_try_purge` if we
purged something. This means that when the `hpa_strict_min_purge_interval`
option is set, only one slab will be purged, because on the next
call the too-frequent-purge protection condition
`since_last_purge_ms < shard->opts.min_purge_interval_ms` will always
be true. This is not the intended behaviour.

Instead, we need to check `min_purge_interval_ms` once and purge as many
pages as needed to satisfy requirements for `hpa_dirty_mult` option.

Made it possible to count the number of actions performed in unit tests
(purge, hugify, dehugify) instead of a binary called/not-called. Extended
the current unit tests with cases where we need to purge more than one
page in a purge phase.
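The fixed control flow can be sketched as follows (hypothetical names and a flat counter in place of the real pageslab/hpdata state; the real loop purges slabs, not a plain integer):

```c
#include <stdint.h>

/* Gate the whole purge pass on the interval ONCE, then purge as many
 * slabs as needed; only a successful pass advances last_purge_ms.
 * Returns the number of slabs purged. */
static int
purge_pass(uint64_t now_ms, uint64_t *last_purge_ms,
    uint64_t min_interval_ms, int nslabs_to_purge) {
	if (now_ms - *last_purge_ms < min_interval_ms) {
		return 0; /* too soon: skip the entire pass */
	}
	int purged = 0;
	while (nslabs_to_purge > 0) {
		/* In the real code: pick a slab, purge its dirty pages. */
		nslabs_to_purge--;
		purged++;
	}
	if (purged > 0) {
		*last_purge_ms = now_ms;
	}
	return purged;
}
```

Contrast with the buggy shape, where the timestamp was advanced after each slab, so the second slab of the same pass already tripped the interval check.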
2024-08-20 10:02:38 -07:00
Dmitry Ilvokhin
0a9f51d0d8 Simplify hpa_shard_maybe_do_deferred_work
It doesn't make much sense to repeat purging once we are done with
hugification, because we could de-hugify pages that were hugified just a
moment ago for no good reason. Let them wait for the next deferred work
phase instead; if they still meet the purging conditions then, purge them.
2024-08-20 10:02:38 -07:00
Amaury Séchet
a25b9b8ba9 Simplify the logic when bumping lg_fill_div. 2024-08-06 13:31:49 -07:00
Shirui Cheng
8fefabd3a4 increase the ncached_max in fill_flush test case to 1024 2024-08-06 13:16:09 -07:00
Shirui Cheng
47c9bcd402 Use a for-loop to fulfill flush requests that are larger than CACHE_BIN_NFLUSH_BATCH_MAX items 2024-08-06 13:16:09 -07:00
Shirui Cheng
48f66cf4a2 add a size check when declare a stack array to be less than 2048 bytes 2024-08-06 13:16:09 -07:00
Burton Li
8dc97b1108 Fix NSTIME_MONOTONIC for win32 implementation 2024-07-30 10:30:41 -07:00
Nathan Slingerland
bc32ddff2d Add usize to prof_sample_hook_t 2024-07-30 10:29:30 -07:00
Dmitry Ilvokhin
b66f689764 Emit long string values without truncation
There are a few long options (`bin_shards` and `slab_sizes`, for example)
whose values get truncated when they are specified and we emit statistics.

Moved the emitting logic for strings into a separate `emitter_emit_str`
function. It tries to emit the string the same way as before, and if the
value is too long, falls back to emitting the rest in chunks of `BUF_SIZE`.

Justification for long strings (longer than `BUF_SIZE`) is not
supported.
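The chunked-emission idea can be sketched like this (a toy `BUF_SIZE` and a hypothetical function name; the real emitter formats through its own fixed-size buffer and output callback):

```c
#include <stdio.h>
#include <string.h>

#define BUF_SIZE 8 /* tiny for illustration; the real buffer is larger */

/* Copy s into out in chunks of at most BUF_SIZE - 1 characters, so a
 * fixed-size formatting buffer never truncates a long value. */
static void
emit_str_chunked(char *out, const char *s) {
	out[0] = '\0';
	size_t len = strlen(s);
	for (size_t off = 0; off < len; off += BUF_SIZE - 1) {
		char chunk[BUF_SIZE];
		/* snprintf writes at most BUF_SIZE - 1 chars plus NUL,
		 * truncating instead of overflowing the chunk buffer. */
		snprintf(chunk, sizeof(chunk), "%s", s + off);
		strcat(out, chunk);
	}
}
```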
2024-07-29 13:58:31 -07:00
Danny Lin
c893fcd169 Change macOS mmap tag to fix conflict with CoreMedia
Tag 101 is assigned to "CoreMedia Capture Data", which makes for confusing output when debugging.

To avoid conflicts, use a tag in the reserved application-specific range from 240–255 (inclusive).

All assigned tags: 94d3b45284/osfmk/mach/vm_statistics.h (L773-L775)
2024-06-26 14:53:48 -07:00
Shirui Cheng
a1fcbebb18 skip tcache GC for tcache_max unit test 2024-06-25 12:59:45 -07:00
Guangli Dai
8477ec9562 Set dependent as false for all rtree reads without ownership 2024-06-24 10:50:20 -07:00
Guangli Dai
21bcc0a8d4 Make JEMALLOC_CXX_THROW definition compatible with newer C++ versions 2024-06-13 11:03:05 -07:00
Dmitry Ilvokhin
867c6dd7dc Option to guard hpa_min_purge_interval_ms fix
The change in `hpa_min_purge_interval_ms` handling logic is not backward
compatible, as it might increase memory usage. Now this logic is guarded by
the `hpa_strict_min_purge_interval` option.

When `hpa_strict_min_purge_interval` is true, we will purge no more often
than once per `hpa_min_purge_interval_ms`. When it is false, the old
purging logic behaviour is preserved.

The long-term strategy is to migrate all users of hpa to the new logic and
then delete the `hpa_strict_min_purge_interval` option.
2024-06-07 10:52:41 -07:00
Dmitry Ilvokhin
91a6d230db Respect hpa_min_purge_interval_ms option
Currently, the hugepage-aware allocator backend works together with the
classic one, as a fallback for not-yet-supported allocations. When
background threads are enabled, the wake-up time for the classic backend
interferes with hpa, as there were no checks inside the hpa purging logic
to ensure we are not purging too frequently. If the background thread is
running and `hpa_should_purge` returns true, then we will purge, even if
we purged less than hpa_min_purge_interval_ms ago.
2024-06-07 10:52:41 -07:00
Dmitry Ilvokhin
90c627edb7 Export hugepage size with arenas.hugepage 2024-06-05 15:37:41 -07:00
David Goldblatt
f9c0b5f7f8 Bin batching: add some stats.
This lets us easily see what fraction of flush load is being taken up by the
bins, and helps guide future optimization approaches (for example: should we
prefetch during cache bin fills? It depends on how many objects the average fill
pops out of the batch).
2024-05-22 10:30:31 -07:00
David Goldblatt
fc615739cb Add batching to arena bins.
This adds a fast-path for threads freeing a small number of allocations to
bins which are not their "home-base" and which encounter lock contention in
attempting to do so. In producer-consumer workflows, such small lock hold times
can cause lock convoying that greatly increases overall bin mutex contention.
2024-05-22 10:30:31 -07:00
David Goldblatt
44d91cf243 Tcache flush: Partition by bin before locking.
This accomplishes two things:
- It avoids a full array scan (and any attendant branch prediction misses, etc.)
  while holding the bin lock.
- It allows us to know the number of items that will be flushed before flushing
  them, which will (in an upcoming commit) let us know if it's safe to use the
  batched flush (in which case we won't acquire the bin mutex).
2024-05-22 10:30:31 -07:00
David Goldblatt
6e56848850 Tcache: Split up small/large handling.
The main bits of shared code are the edata filtering and the stats flushing
logic, both of which are fairly simple to read and not so painful to duplicate.
The shared code comes at the cost of guarding all the subtle logic with
`if (small)`, which doesn't feel worth it.
2024-05-22 10:30:31 -07:00
David Goldblatt
c085530c71 Tcache batching: Plumbing
In the next commit, we'll start using the batcher to eliminate mutex traffic.
To avoid cluttering up that commit with the random bits of busy-work it entails,
we'll centralize them here.  This commit introduces:
- A batched bin type.
- The ability to mix batched and unbatched bins in the arena.
- Conf parsing to set batches per size and a max batched size.
- mallctl access to the corresponding opt-namespace keys.
- Stats output of the above.
2024-05-22 10:30:31 -07:00
David Goldblatt
70c94d7474 Add batcher module.
This can be used to batch up simple operation commands for later use by another
thread.
2024-05-22 10:30:31 -07:00
David Goldblatt
86f4851f5d Add clang static analyzer suppression macro. 2024-05-22 10:30:31 -07:00
Amaury Séchet
5afff2e44e Simplify the logic in tcache_gc_small. 2024-05-02 18:52:19 -07:00
Qi Wang
8d8379da44 Fix background_thread creation for the oversize_arena.
Bypassing background thread creation for the oversize_arena used to be an
optimization since that arena had eager purging.  However #2466 changed the
purging policy for the oversize_arena -- specifically it switched to the default
decay time when background_thread is enabled.

This issue is noticeable when the number of arenas is low: whenever the total #
of arenas is <= 4 (which is the default max # of background threads),
purging will be stalled since no background thread is created for the
oversize_arena.
2024-05-02 14:45:18 -07:00
Dmitry Ilvokhin
47d69b4eab HPA: Fix infinite purging loop
One of the conditions to start purging is that the
`hpa_hugify_blocked_by_ndirty` function call returns true. This can happen
in cases where we have no dirty memory for this shard at all, in which case
the purging loop becomes an infinite loop.

`hpa_hugify_blocked_by_ndirty` was introduced in 0f6c420, but at that
time the purging loop had a different form and an additional `break` was
not required. The purging loop was re-written in 6630c5989, but the
additional exit condition wasn't added there at the time.

Repro code was shared by Patrik Dokoupil at [1]; I stripped it down to the
minimum needed to reproduce the issue in jemalloc unit tests.

[1]: https://github.com/jemalloc/jemalloc/pull/2533
2024-04-30 13:46:32 -07:00
Qi Wang
fa451de17f Fix the tcache flush sanity checking around ncached and nstashed.
When there were many items stashed, it's possible that after flushing the
stashed items, ncached is already lower than the remainder, in which case
the flush can simply return at that point.
2024-04-12 16:01:55 -07:00
debing.sun
630434bb0a Fixed type error with allocated that caused incorrect printing on 32bit 2024-04-09 14:44:43 -07:00
Shirui Cheng
4b555c11a5 Enable heap profiling on MacOS 2024-04-09 12:57:01 -07:00
Daniel Hodges
11038ff762 Add support for namespace pids in heap profile names
This change adds support for writing pid namespaces to the filename of a
heap profile. When running with namespaces, pids may be reused across
namespaces, and if the mounts where profiles are written are shared, there
is no great way to differentiate profiles between pids.

Signed-off-by: Daniel Hodges <hodges.daniel.scott@gmail.com>
Signed-off-by: Daniel Hodges <hodgesd@fb.com>
2024-04-09 10:27:52 -07:00
Qi Wang
83b075789b rallocx path: only set errno on the realloc case. 2024-04-05 17:41:43 -07:00
Shirui Cheng
5081c16bb4 Experimental calloc implementation with using memset on larger sizes 2024-04-04 15:31:56 -07:00
Juhyung Park
38056fea64 Set errno to ENOMEM on rallocx() OOM failures
realloc() and rallocx() share a path, and realloc() should set errno to
ENOMEM upon OOM failures.

Fixes: ee961c2310 ("Merge realloc and rallocx pathways.")
Signed-off-by: Juhyung Park <qkrwngud825@gmail.com>
2024-04-04 15:13:22 -07:00
Dmitry Ilvokhin
268e8ee880 Include HPA ndirty into page allocator ndirty stat 2024-04-04 12:17:30 -07:00
Dmitry Ilvokhin
b2e59a96e1 Introduce getters for page allocator shard stats
Access nactive, ndirty and nmuzzy through getters and not directly.
There is no functional change, but the getters are required to propagate
the HPA's statistics up to the Page Allocator's statistics.
2024-04-04 12:17:30 -07:00
Amaury Séchet
92aa52c062 Reduce nesting in phn_merge_siblings using an early return. 2024-03-14 13:08:17 -07:00
Amaury Séchet
10d713151d Ensure that the root of a heap is always the best element. 2024-03-14 13:07:45 -07:00
Minsoo Choo
1978e5cdac Update actions/checkout and actions/upload-artifact to v4 2024-03-12 12:59:15 -07:00
XChy
ed9b00a96b Replace unsigned induction variable with size_t in background_threads_enable
This patch avoids unnecessary vectorizations in clang and missed recognition of memset in gcc. See also https://godbolt.org/z/aoeMsjr4c.
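A minimal illustration of the pattern (a hypothetical helper; the actual change is inside background_threads_enable): using a `size_t` induction variable matches the natural array index type, which helps compilers recognize the loop as a simple fill.

```c
#include <stddef.h>

/* size_t induction variable: no implicit widening per iteration, so the
 * compiler can lower the loop to a memset-style fill. */
static void
zero_bytes(unsigned char *buf, size_t n) {
	for (size_t i = 0; i < n; i++) {
		buf[i] = 0;
	}
}
```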
2024-03-05 14:54:50 -08:00
Shirui Cheng
373884ab48 print out all malloc_conf settings in stats 2024-02-29 12:12:44 -08:00
Qi Wang
1aba4f41a3 Allow zero sized memalign to pass.
Instead of failing on assertions.  Previously the same change was made for
posix_memalign and aligned_alloc (#1554).  Make memalign behave the same way
even though it's obsolete.
2024-02-16 13:06:07 -08:00
Qi Wang
6d181bc1b7 Fix Cirrus CI.
13.0-RELEASE does not exist anymore.  "The resource
'projects/freebsd-org-cloud-dev/global/images/family/freebsd-13-0' was not
found"
2024-02-16 13:05:40 -08:00
David Goldblatt
f96010b7fa gitignore: Start ignoring clangd dirs. 2024-01-23 17:02:01 -08:00
Qi Wang
a2c5267409 HPA: Allow frequently reused allocs to bypass the slab_max_alloc limit, as long as
they're within the huge page size.  These requests do not raise internal
fragmentation concerns with huge pages, since the entire range is expected to be
accessed.
2024-01-18 14:51:04 -08:00
guangli-dai
b1792c80d2 Add LOGs when entering and exiting free and sdallocx. 2024-01-11 14:37:20 -08:00
Qi Wang
05160258df When safety_check_fail, also embed the hint msg in the abort function name,
because there are cases where only crash stack traces are logged.
2024-01-11 14:19:54 -08:00
Qi Wang
3a6296e1ef Disable FreeBSD on Travis CI since it's not working.
Travis CI currently provides only FreeBSD 12 which is EOL.
2024-01-04 14:47:52 -08:00
Minsoo Choo
d284aad027 Test on more FreeBSD versions
Added 14.0-RELEASE
Added 15-CURRENT
Added 14-STABLE
Added 13-STABLE

13.0-RELEASE will be updated when 13.3-RELEASE comes out.
2024-01-04 12:48:24 -08:00
Connor
dfb3260b97 Fix missing cleanup message for collected profiles.
```
sub cleanup {
  unlink($main::tmpfile_sym);
  unlink(keys %main::tempnames);

  # We leave any collected profiles in $HOME/jeprof in case the user wants
  # to look at them later.  We print a message informing them of this.
  if ((scalar(@main::profile_files) > 0) &&
      defined($main::collected_profile)) {
    if (scalar(@main::profile_files) == 1) {
      print STDERR "Dynamically gathered profile is in $main::collected_profile\n";
    }
    print STDERR "If you want to investigate this profile further, you can do:\n";
    print STDERR "\n";
    print STDERR "  jeprof \\\n";
    print STDERR "    $main::prog \\\n";
    print STDERR "    $main::collected_profile\n";
    print STDERR "\n";
  }
}
```
On cleanup, a message should be printed for the collected profile.
However, if there is only one collected profile, it is popped at L691, so `scalar(@main::profile_files)` becomes 0 and no message is printed.
2024-01-03 14:24:38 -08:00
Honggyu Kim
f6fe6abdcb build: Make autogen.sh accept quoted extra options
The current autogen.sh script doesn't allow receiving quoted extra
options.

If someone wants to pass extra CFLAGS that is split into multiple
options with a whitespace, then a quote is required.

However, the configure inside autogen.sh fails in this case as follows.

  $ ./autogen.sh CFLAGS="-Dmmap=cxl_mmap -Dmunmap=cxl_munmap"
  autoconf
  ./configure --enable-autogen CFLAGS=-Dmmap=cxl_mmap -Dmunmap=cxl_munmap
  configure: error: unrecognized option: `-Dmunmap=cxl_munmap'
  Try `./configure --help' for more information
  Error 0 in ./configure

This is because the quotes are unexpectedly discarded when calling configure.

This patch fixes the problem.

Signed-off-by: Honggyu Kim <honggyu.kim@sk.com>
2024-01-03 14:20:34 -08:00
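The usual fix for this class of quoting bug is forwarding arguments with `"$@"`. A minimal illustrative sketch (not the actual autogen.sh) of the difference:

```shell
# Forwarding with "$@" keeps a quoted option such as
# CFLAGS="-Dmmap=cxl_mmap -Dmunmap=cxl_munmap" as a single word;
# bare $@ re-splits it on whitespace.
count_args() { echo "$#"; }
broken() { count_args $@; }    # unquoted: word-splitting occurs
fixed() { count_args "$@"; }   # quoted: arguments preserved verbatim
broken_n=$(broken 'CFLAGS=-Dmmap=cxl_mmap -Dmunmap=cxl_munmap')
fixed_n=$(fixed 'CFLAGS=-Dmmap=cxl_mmap -Dmunmap=cxl_munmap')
echo "broken=$broken_n fixed=$fixed_n"   # prints: broken=2 fixed=1
```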
guangli-dai
eda05b3994 Fix static analysis warnings. 2024-01-03 14:18:52 -08:00
Shirui Cheng
e4817c8d89 Cleanup cache_bin_info_t* info input args 2023-10-25 10:27:31 -07:00
Qi Wang
3025b021b9 Optimize mutex and bin alignment / locality. 2023-10-23 20:28:26 -07:00
guangli-dai
e2cd27132a Change stack_size assertion back to the more compatible one. 2023-10-23 20:28:26 -07:00
guangli-dai
756d4df2fd Add util.c into vs project file. 2023-10-18 22:11:13 -07:00
Qi Wang
04d1a87b78 Fix a zero-initializer warning on macOS. 2023-10-18 14:12:43 -07:00
guangli-dai
d88fa71bbd Fix nfill = 0 bug when ncached_max is 1 2023-10-18 14:11:46 -07:00
guangli-dai
6fb3b6a8e4 Refactor the tcache initialization
1. Pre-generate all default tcache ncached_max in tcache_boot;
2. Add getters returning default ncached_max and ncached_max_set;
3. Refactor tcache init so that it is always init with a given setting.
2023-10-18 14:11:46 -07:00
guangli-dai
8a22d10b83 Allow setting default ncached_max for each bin through malloc_conf 2023-10-18 14:11:46 -07:00
guangli-dai
867eedfc58 Fix the bug in dalloc of promoted allocations.
A sufficiently small allocation may be promoted so that it does not
share an extent with others.  However, on dalloc, such allocations
may not be deallocated as promoted ones if nbins < SC_NBINS.  This
commit fixes the bug.
2023-10-17 14:53:23 -07:00
guangli-dai
630f7de952 Add mallctl to set and get ncached_max of each cache_bin.
1. `thread_tcache_ncached_max_read_sizeclass` allows users to get the
    ncached_max of the bin for the input sizeclass, passed in through
    oldp (the size is rounded up if it is not an exact bin size).
2. `thread_tcache_ncached_max_write` takes in a char array
    representing the settings for bins in the tcache.
2023-10-17 14:53:23 -07:00
guangli-dai
6b197fdd46 Pre-generate ncached_max for all bins for better tcache_max tuning experience. 2023-10-17 14:53:23 -07:00
Shirui Cheng
36becb1302 metadata usage breakdowns: tracking edata and rtree usages 2023-10-11 11:56:01 -07:00
Qi Wang
005f20aa7f Fix comments about malloc_conf to enable logging. 2023-10-04 11:49:10 -07:00
guangli-dai
7a9e4c9073 Mark jemalloc.h as system header to resolve header conflicts. 2023-10-04 11:41:30 -07:00
Qi Wang
72cfdce718 Allocate tcache stack from base allocator
When using metadata_thp, allocate tcache bin stacks from base0, which means they
will be placed on huge pages along with other metadata, instead of mixed with
other regular allocations.

In order to do so, the base allocator was modified to support limited reuse: freed
tcache stacks (from thread termination) will be returned to base0 and made
available for reuse, but no merging will be attempted since they were bump
allocated out of base blocks. These reused base extents are managed using
separately allocated base edata_t -- they are cached in base->edata_avail when
the extent is all allocated.

One tricky part is that stats updating must be skipped for such reused extents
(since they were accounted for already, and there is no purging for base). This
requires tracking the "is reused" state explicitly and bypassing the stats
updates when allocating from them.
2023-09-18 12:18:32 -07:00
guangli-dai
a442d9b895 Enable per-tcache tcache_max
1. add tcache_max and nhbins into tcache_t so that they are per-tcache;
   with one auto tcache per thread, they are also per-thread;
2. add mallctl for each thread to set its own tcache_max (of its auto tcache);
3. store the maximum number of items in each bin instead of using a global storage;
4. add tests for the modifications above;
5. rename `nhbins` and `tcache_maxclass` to `global_do_not_change_nhbins` and `global_do_not_change_tcache_maxclass`.
2023-09-06 10:47:14 -07:00
guangli-dai
fbca96c433 Remove unnecessary parameters for cache_bin_postincrement. 2023-09-06 10:47:14 -07:00
Evers Chen
7d9eceaf38 Fix array bounds false warning in gcc 12.3.0
1. error: array subscript 232 is above array bounds of ‘size_t[232]’ in gcc 12.3.0
2. it also helps the compiler optimize the code
2023-09-05 14:33:55 -07:00
BtbN
ce8ce99a4a Expose jemalloc_prefix via pkg-config 2023-09-05 14:30:21 -07:00
BtbN
ed7e6fe71a Expose private library dependencies via pkg-config
When linking statically, these need to be included for linking to succeed.
2023-09-05 14:29:33 -07:00
Qi Wang
7d563a8f81 Update safety check message to remove --enable-debug when it's already on. 2023-09-05 14:15:45 -07:00
Qi Wang
b71da25b8a Fix reading CPU id using rdtscp.
As pointed out in #2527, the correct register containing the CPU id is ecx
instead of edx.
2023-08-28 11:46:39 -07:00
Qi Wang
87c56c8df8 Fix arenas.i.bins.j.mutex link id in manual. 2023-08-28 11:01:13 -07:00
Kevin Svetlitski
da66aa391f Enable a few additional warnings for CI and fix the issues they uncovered
- `-Wmissing-prototypes` and `-Wmissing-variable-declarations` are
  helpful for finding dead code and/or things that should be `static`
  but aren't marked as such.
- `-Wunused-macros` is of similar utility, but for identifying dead macros.
- `-Wunreachable-code` and `-Wunreachable-code-aggressive` do exactly
  what they say: flag unreachable code.
2023-08-11 13:56:23 -07:00
Kevin Svetlitski
d2c9ed3d1e Ensure short read(2)s/write(2)s are properly handled by IO utilities
`read(2)` and `write(2)` may read or write fewer bytes than were
requested. In order to robustly ensure that all of the requested bytes
are read/written, these edge-cases must be handled.
2023-08-11 13:36:24 -07:00
guangli-dai
254c4847e8 Print colorful reminder for failed tests. 2023-08-08 15:01:07 -07:00
Kevin Svetlitski
4f50f782fa Use compiler-provided assume builtins when available
There are several benefits to this:
1. It's cleaner and more reliable to use the builtin to
   inform the compiler of assumptions instead of hoping that the
   optimizer understands your intentions.
2. `clang` will warn you if any of your assumptions would produce
   side-effects (which the compiler will discard). [This blog post](https://fastcompression.blogspot.com/2019/01/compiler-checked-contracts.html)
   by Yann Collet highlights that a hazard of using the
   `unreachable()`-based method of signaling assumptions is that it
   can sometimes result in additional instructions being generated (see
   [this Godbolt link](https://godbolt.org/z/lKNMs3) from the blog post
   for an example).
2023-08-08 14:59:36 -07:00
Kevin Svetlitski
3aae792b10 Fix infinite purging loop in HPA
As reported in #2449, under certain circumstances it's possible to get
stuck in an infinite loop attempting to purge from the HPA. We now
handle this by validating the HPA settings at the end of
configuration parsing and either normalizing them or aborting depending on
if `abort_conf` is set.
2023-08-08 14:36:19 -07:00
Kevin Svetlitski
424dd61d57 Issue a warning upon directly accessing an arena's bins
An arena's bins should normally be accessed via the `arena_get_bin`
function, which properly takes into account bin-shards. To ensure that
we don't accidentally commit code which incorrectly accesses the bins
directly, we mark the field with `__attribute__((deprecated))` with an
appropriate warning message, and suppress the warning in the few places
where directly accessing the bins is allowed.
2023-08-04 15:47:05 -07:00
Kevin Svetlitski
120abd703a Add support for the deprecated attribute
This is useful for enforcing the usage of getter/setter functions to
access fields which are considered private or have unique access constraints.
2023-08-04 15:47:05 -07:00
Kevin Svetlitski
162ff8365d Update the Ubuntu version used by Travis CI
Update from Ubuntu Focal Fossa to Ubuntu Jammy Jellyfish. Staying up to
date is always good, but I'm also hoping that perhaps this newer release
contains fixes so that PowerPC VMs don't randomly hang indefinitely
while booting anymore, stalling our CI pipeline.
2023-08-04 15:32:15 -07:00
Kevin Svetlitski
07a2eab3ed Stop over-reporting memory usage from sampled small allocations
@interwq noticed [while reviewing an earlier PR](https://github.com/jemalloc/jemalloc/pull/2478#discussion_r1256217261)
that I missed modifying this statistics accounting in line with the rest
of the changes from #2459. This is now fixed, such that sampled small
allocations increment the `.nmalloc`/`.ndalloc` of their effective bin
size instead of over-reporting memory usage by attributing all such
allocations to `SC_LARGE_MINCLASS`.
2023-08-03 16:12:22 -07:00
Kevin Svetlitski
ea5b7bea31 Add configuration option controlling DSS support
In many environments, the fallback `sbrk(2)` allocation path is never
used even if the system supports the syscall; if you're at the point
where `mmap(2)` is failing, `sbrk(2)` is unlikely to succeed. Without
changing the default, I've added the ability to disable the usage of DSS
altogether, so that you do not need to pay for the additional code size
and handful of extra runtime branches in such environments.
2023-08-03 11:52:25 -07:00
Qi Wang
6816b23862 Include the unrecognized malloc conf option in the error message.
Previously the offending option would not be printed unless it was in the
key:value pair format.
2023-08-02 10:44:55 -07:00
Kevin Svetlitski
62648c88e5 Ensured sampled allocations are properly deallocated during arena_reset
Sampled allocations were not being demoted before being deallocated
during an `arena_reset` operation.
2023-08-01 11:35:37 -07:00
Kevin Svetlitski
b01d496646 Add an override for the compile-time malloc_conf to jemalloc_internal_overrides.h 2023-07-31 14:53:15 -07:00
Kevin Svetlitski
9ba1e1cb37 Make ctl_arena_clear slightly more efficient
While this function isn't particularly hot, (accounting for just 0.27% of
time spent inside the allocator on average across the fleet), looking
at the generated assembly and performance profiles does show we're dispatching
to multiple different `memset`s when we could instead be just tail-calling
`memset` once, reducing code size and marginally improving performance.
2023-07-31 14:44:04 -07:00
Kevin Svetlitski
8ff7e7d6c3 Remove errant #includes in public jemalloc.h header
In an attempt to make all headers self-contained, I inadvertently added
`#include`s which refer to intermediate, generated headers that aren't
included in the final install. Closes #2489.
2023-07-25 16:26:50 -07:00
Kevin Svetlitski
3e82f357bb Fix all optimization-inhibiting integer-to-pointer casts
Following from PR #2481, we replace all integer-to-pointer casts [which
hide pointer provenance information (and thus inhibit
optimizations)](https://clang.llvm.org/extra/clang-tidy/checks/performance/no-int-to-ptr.html)
with equivalent operations that preserve this information. I have
enabled the corresponding clang-tidy check in our static analysis CI so
that we do not get bitten by this again in the future.
2023-07-24 14:40:42 -07:00
Kevin Svetlitski
4827bb17bd Remove vestigial TCACHE_STATE_* macros 2023-07-24 14:40:42 -07:00
Kevin Svetlitski
1431153695 Define SBRK_INVALID instead of using a magic number 2023-07-24 14:40:42 -07:00
Kevin Svetlitski
7e54dd1ddb Define PROF_TCTX_SENTINEL instead of using magic numbers
This makes the code more readable on its own, and also sets the stage
for more cleanly handling the pointer provenance lints in a following
commit.
2023-07-24 14:40:42 -07:00
Kevin Svetlitski
c49c17f128 Suppress verbose frame address warnings
These warnings are not useful, and make the output of some CI jobs
enormous and difficult to read, so let's suppress them.
2023-07-24 10:44:17 -07:00
Kevin Svetlitski
cdb2c0e02f Implement C23's free_sized and free_aligned_sized
[N2699 - Sized Memory Deallocation](https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2699.htm)
introduced two new functions which were incorporated into the C23
standard, `free_sized` and `free_aligned_sized`. Both already have
analogues in Jemalloc, all we are doing here is adding the appropriate
wrappers.
2023-07-20 15:06:41 -07:00
Kevin Svetlitski
41e0b857be Make headers self-contained by fixing #includes
Header files are now self-contained, which makes the relationships
between the files clearer, and crucially allows LSP tools like `clangd`
to function correctly in all of our header files. I have verified that
the headers are self-contained (aside from the various Windows shims) by
compiling them as if they were C files – in a follow-up commit I plan to
add this to CI to ensure we don't regress on this front.
2023-07-14 09:06:32 -07:00
Kevin Svetlitski
856db56f6e Move tsd implementation details into tsd_internals.h
This is a prerequisite to achieving self-contained headers. Previously,
the various tsd implementation headers (`tsd_generic.h`,
`tsd_tls.h`, `tsd_malloc_thread_cleanup.h`, and `tsd_win.h`) relied
implicitly on being included in `tsd.h` after a variety of dependencies
had been defined above them. This commit instead makes these
dependencies explicit by splitting them out into a separate file,
`tsd_internals.h`, which each of the tsd implementation headers includes
directly.
2023-07-14 09:06:32 -07:00
Kevin Svetlitski
36ca0c1b7d Stop concealing pointer provenance in phn_link_get
At least for LLVM, [casting from an integer to a pointer hides provenance information](https://clang.llvm.org/extra/clang-tidy/checks/performance/no-int-to-ptr.html)
and inhibits optimizations. Here's a [Godbolt link](https://godbolt.org/z/5bYPcKoWT)
showing how this change removes a couple unnecessary branches in
`phn_merge_siblings`, which is a very hot function. Canary profiles show
only minor improvements (since most of the cost of this function is in
cache misses), but there's no reason we shouldn't take it.
2023-07-13 15:12:31 -07:00
Kevin Svetlitski
314c073a38 Print the failed assertion before aborting in test cases
This makes it faster and easier to debug, so that you don't need to fire
up a debugger just to see which assertion triggered in a failing test.
2023-07-13 15:07:17 -07:00
Kevin Svetlitski
65d3b5989b Print test error messages in color when stderr is a terminal
When stderr is a terminal and supports color, print error messages
from tests in red to make them stand out from the surrounding output.
2023-07-13 13:03:23 -07:00
Kevin Svetlitski
1d9e9c2ed6 Fix inconsistent parameter names between definition/declaration pairs
For the sake of consistency, function definitions and their
corresponding declarations should use the same names for parameters.
I've enabled this check in static analysis to prevent this issue from
occurring again in the future.
2023-07-13 12:59:47 -07:00
Kevin Svetlitski
5711dc31d8 Only enable -Wstrict-prototypes in CI to unbreak feature detection
Adding `-Wstrict-prototypes` to the default `CFLAGS` in PR #2473 had the
non-obvious side-effect of breaking configure-time feature detection,
because the [test-program `autoconf` generates for feature
detection](https://www.gnu.org/software/autoconf/manual/autoconf-2.67/html_node/Generating-Sources.html#:~:text=main%20())
defines `main` as:
```c
int main()
```
Which causes all feature checks to fail, since this triggers
`-Wstrict-prototypes` and the feature checks use `-Werror`.

Resolved by only adding `-Wstrict-prototypes` to
`EXTRA_{CFLAGS,CXXFLAGS}` in CI, since these flags are not used during
feature detection and we control which compiler is used.
2023-07-06 18:03:13 -07:00
Kevin Svetlitski
589c63b424 Make eligible global variables static and/or const
For better or worse, Jemalloc has a significant number of global
variables. Making all eligible global variables `static` and/or `const`
at least makes it slightly easier to reason about them, as these
qualifications communicate to the programmer restrictions on their use
without having to `grep` the whole codebase.
2023-07-06 14:15:12 -07:00
Qi Wang
e249d1a2a1 Remove unreachable code. 2023-07-06 12:06:06 -07:00
Qi Wang
602edd7566 Enabled -Wstrict-prototypes and fixed warnings. 2023-07-06 12:00:02 -07:00
Kevin Svetlitski
ebd7e99f5c Add a test-case for small profiled allocations
Validate that small allocations (i.e. those with `size <= SC_SMALL_MAXCLASS`)
which are sampled for profiling maintain the expected invariants even
though they now take up less space.
2023-07-03 16:19:06 -07:00
Kevin Svetlitski
5a858c64d6 Reduce the memory overhead of sampled small allocations
Previously, small allocations which were sampled as part of heap
profiling were rounded up to `SC_LARGE_MINCLASS`. This additional memory
usage becomes problematic when the page size is increased, as noted in #2358.

Small allocations are now rounded up to the nearest multiple of `PAGE`
instead, reducing the memory overhead by a factor of 4 in the most
extreme cases.
2023-07-03 16:19:06 -07:00
Kevin Svetlitski
e1338703ef Address compiler warnings in the unit tests 2023-07-03 16:06:35 -07:00
Qi Wang
d131331310 Avoid eager purging on the dedicated oversize arena when using bg thds.
We have observed new workload patterns (namely ML training type) that cycle
through oversized allocations frequently, because 1) the dataset might be sparse
which is faster to go through, and 2) GPU accelerated.  As a result, the eager
purging from the oversize arena becomes a bottleneck.  To offer an easy
solution, allow normal purging of the oversized extents when background threads
are enabled.
2023-06-27 11:57:41 -07:00
Kevin Svetlitski
46e464a26b Fix downloading LLVM in GitHub Action
It turns out LLVM does not include a build for every platform in the
assets for every release, just some of them. As such, I've pinned us to
the latest release version with a corresponding build.
2023-06-23 14:30:49 -07:00
Kevin Svetlitski
f2e00d2fd3 Remove trailing whitespace
Additionally, added a GitHub Action to ensure no more trailing
whitespace will creep in again in the future.

I'm excluding Markdown files from this check, since trailing whitespace
is significant there, and also excluding `build-aux/install-sh` because
there is significant trailing whitespace on the line that sets
`defaultIFS`.
2023-06-23 11:58:18 -07:00
Kevin Svetlitski
05385191d4 Add GitHub action which runs static analysis
Now that all of the various issues that static analysis uncovered have
been fixed (#2431, #2432, #2433, #2436, #2437, #2446), I've added a
GitHub action which will run static analysis for every PR going forward.
When static analysis detects issues with your code, the GitHub action
provides a link to download its findings in a form tailored for human
consumption.

Take a look at [this demonstration of what it looks like when static
analysis issues are
found](https://github.com/Svetlitski/jemalloc/actions/runs/5010245602)
on my fork for an example (make sure to follow the instructions in the
error message to download and inspect the results).
2023-06-23 11:55:43 -07:00
Kevin Svetlitski
bb0333e745 Fix remaining static analysis warnings
Fix or suppress the remaining warnings generated by static analysis.
This is a necessary step before we can incorporate static analysis into
CI. Where possible, I've preferred to modify the code itself instead of
just disabling the warning with a magic comment, so that if we decide to
use different static analysis tools in the future we will be covered
against them raising similar warnings.
2023-06-23 11:50:29 -07:00
Kevin Svetlitski
210f0d0b2b Fix read of uninitialized data in prof_free
In #2433, I inadvertently introduced a regression which causes the use of
uninitialized data. Namely, the control path I added for the safety
check in `arena_prof_info_get` neglected to set `prof_info->alloc_tctx`
when the check fails, resulting in `prof_info.alloc_tctx` being
uninitialized [when it is read at the end of
`prof_free`](90176f8a87/include/jemalloc/internal/prof_inlines.h (L272)).
2023-06-15 18:30:05 -07:00
Kevin Svetlitski
90176f8a87 Fix segfault in rb *_tree_remove
Static analysis flagged this. It's possible to segfault in the
`*_tree_remove` function generated by `rb_gen`, as `nodep` may
still be `NULL` after the initial for loop. I can confirm from reviewing
the fleetwide coredump data that this was in fact being hit in
production, primarily through `tctx_tree_remove`, and much more rarely
through `gctx_tree_remove`.
2023-06-07 14:48:41 -07:00
Qi Wang
86eb49b478 Fix the arena selection for oversized allocations.
Use the per-arena oversize_threshold, instead of the global setting.
2023-06-06 15:03:13 -07:00
Christos Zoulas
5832ef6589 Use a local variable to set the alignment for this particular allocation
instead of changing mmap_flags, which makes the change permanent. This was
enforcing large alignments for allocations that did not need them, causing
fragmentation. Reported by Andreas Gustafsson.
2023-05-31 14:44:24 -07:00
Kevin Svetlitski
6d4aa33753 Extract the calculation of psset heap assignment for an hpdata into a common function
This is in preparation for upcoming changes I plan to make to this
logic. Extracting it into a common function will make this easier and
less error-prone, and cleans up the existing code regardless.
2023-05-31 11:44:04 -07:00
Arne Welzel
c1d3ad4674 Prune je_malloc_default and do_rallocx in jeprof
Running simple Ruby and Python programs shows je_malloc_default and
do_rallocx() in the resulting SVG / text output. Prune these, too.

    MALLOC_CONF='stats_print:true,lg_prof_sample:8,prof:true,prof_final:true' \
        python3 -c '[x for x in range(10000000)]'

    MALLOC_CONF='stats_print:true,lg_prof_sample:8,prof:true,prof_final:true' \
        ruby -e 'puts (0..1000).map{"0"}.join(" ")'
2023-05-31 11:41:09 -07:00
Arne Welzel
d59e30cbc9 Rename fallback_impl to fallbackNewImpl and prune in jeprof
The existing fallback_impl name seemed a bit generic, and given
that it's static, it is probably okay to rename.

Closes #2451
2023-05-31 11:41:09 -07:00
Qi Wang
d577e9b588 Explicitly cast to unsigned for MALLOCX_ARENA and _TCACHE defines. 2023-05-26 11:52:42 -07:00
Qi Wang
a2259f9fa6 Fix the include path of "jemalloc_internal_overrides.h". 2023-05-25 15:22:02 -07:00
Kevin Svetlitski
9c32689e57 Fix bug where hpa_shard was not being destroyed
It appears that this was a simple mistake where `hpa_shard_disable` was
being called instead of `hpa_shard_destroy`. At present
`hpa_shard_destroy` is not called anywhere at all outside of test-cases,
which further suggests that this is a bug. @davidtgoldblatt noted
however that since HPA is disabled for manual arenas and we don't
support destruction for auto arenas that presently there is no way to
actually trigger this bug. Nonetheless, it should be fixed.
2023-05-18 14:17:38 -07:00
Kevin Svetlitski
4e6f1e9208 Allow overriding LG_PAGE
This is useful for our internal builds where we override the
configuration in the header files generated by autoconf.
2023-05-17 13:55:38 -07:00
Kevin Svetlitski
3e2ba7a651 Remove dead stores detected by static analysis
None of these are harmful, and they are almost certainly optimized
away by the compiler. The motivation for fixing them anyway is that
we'd like to enable static analysis as part of CI, and the first step
towards that is resolving the warnings it produces at present.
2023-05-11 20:27:49 -07:00
Kevin Svetlitski
0288126d9c Fix possible NULL pointer dereference from mallctl("prof.prefix", ...)
Static analysis flagged this issue. Here is a minimal program which
causes a segfault within Jemalloc:
```
#include <jemalloc/jemalloc.h>

const char *malloc_conf = "prof:true";

int main() {
  mallctl("prof.prefix", NULL, NULL, NULL, 0);
}
```

Fixed by checking if `prefix` is `NULL`.
2023-05-11 14:47:50 -07:00
Qi Wang
d4a2b8bab1 Add the prof_sys_thread_name feature in the prof_recent unit test.
This tests the combination of the prof_recent and thread_name features.
Verified that it catches the issue being fixed in this PR.

Also explicitly set thread name in test/unit/prof_recent.  This fixes the name
testing when no default thread name is set (e.g. FreeBSD).
2023-05-11 09:10:57 -07:00
Qi Wang
94ace05832 Fix the prof thread_name reference in prof_recent dump.
As pointed out in #2434, the thread_name in prof_tdata_t was changed in #2407.
This also requires an update for the prof_recent dump, specifically the emitter
expects a "char **" which is fixed in this commit.
2023-05-11 09:10:57 -07:00
Qi Wang
6ea8a7e928 Add config detection for JEMALLOC_HAVE_PTHREAD_SET_NAME_NP.
and use it on the background thread name setting.
2023-05-11 09:10:57 -07:00
auxten
5bac384970 If ptr present check if alloc_ctx.edata == NULL 2023-05-10 17:18:22 -07:00
auxten
019cccc293 Make arenas_lookup_ctl triable 2023-05-10 17:18:22 -07:00
Kevin Svetlitski
dc0a184f8d Fix possible NULL pointer dereference in VERIFY_READ
Static analysis flagged this. Fixed by simply checking `oldlenp`
before dereferencing it.
2023-05-09 10:57:09 -07:00
Kevin Svetlitski
12311fe6c3 Fix segfault in extent_try_coalesce_impl
Static analysis flagged this. `extent_record` was passing `NULL` as the
value for `coalesced` to `extent_try_coalesce`, which in turn passes
that argument to `extent_try_coalesce_impl`, where it is written to
without checking if it is `NULL`. I can confirm from reviewing the
fleetwide coredump data that this was in fact being hit in production.
2023-05-09 10:55:44 -07:00
Kevin Svetlitski
70344a2d38 Make eligible functions static
The codebase is already very disciplined in making any function which
can be `static`, but there are a few that appear to have slipped through
the cracks.
2023-05-08 15:00:02 -07:00
Kevin Svetlitski
6841110bd6 Make edata_cmp_summary_comp 30% faster
`edata_cmp_summary_comp` is one of the very hottest functions, taking up
3% of all time spent inside Jemalloc. I noticed that all existing
callsites rely only on the sign of the value returned by this function,
so I came up with this equivalent branchless implementation which
preserves this property. After empirical measurement, I have found that
this implementation is 30% faster, therefore representing a 1% speed-up
to the allocator as a whole.

At @interwq's suggestion, I've applied the same optimization to
`edata_esnead_comp` in case this function becomes hotter in the future.
2023-05-04 09:59:17 -07:00
Amaury Séchet
f2b28906e6 Some nits in cache_bin.h 2023-05-01 10:21:17 -07:00
Kevin Svetlitski
fc680128e0 Remove errant assert in arena_extent_alloc_large
This codepath may generate deferred work when the HPA is enabled.
See also [@davidtgoldblatt's relevant comment on the PR which
introduced this](https://github.com/jemalloc/jemalloc/pull/2107#discussion_r699770967)
which prevented a similarly incorrect `assert` from being added elsewhere.
2023-05-01 10:00:30 -07:00
Eric Mueller
521970fb2e Check for equality instead of assigning in asserts in hpa_from_pai.
It appears that a simple typo means we're unconditionally overwriting
some fields in hpa_from_pai when asserts are enabled. From hpa_shard_init,
it looks like these fields have these values anyway, so this shouldn't
cause bugs, but if something is wrong it seems better to have these
asserts in place.

See issue #2412.
2023-04-17 20:57:48 -07:00
guangli-dai
5f64ad60cd Remove locked flag set in malloc_mutex_trylock
As a hint flag for the lock, the locked parameter should be set only
when the lock is acquired or released.
2023-04-06 10:57:04 -07:00
Qi Wang
434a68e221 Disallow decay during reentrancy.
Decay should not be triggered during reentrant calls (may cause lock order
reversal / deadlocks).  Added a delay_trigger flag to the tickers to bypass
decay when reentrancy_level is not zero.
2023-04-05 10:16:37 -07:00
Qi Wang
e62aa478c7 Rearrange the bools in prof_tdata_t to save some bytes.
This lowered the sizeof(prof_tdata_t) from 200 to 192 which is a round size
class.  Afterwards the tdata_t size remains unchanged with the last commit, which
effectively inlined the storage of thread names for free.
2023-04-05 10:03:12 -07:00
Qi Wang
ce0b7ab6c8 Inline the storage for thread name in prof_tdata_t.
The previous approach managed the thread name in a separate buffer, which causes
races because the thread name update (triggered by new samples) can happen at
the same time as prof dumping (which reads the thread names) -- these two
operations are under separate locks to avoid blocking each other.  Implemented
the thread name storage as part of the tdata struct, which resolves the lifetime
issue and also avoids internal alloc / dalloc during prof_sample.
2023-04-05 10:03:12 -07:00
Qi Wang
6cab460a45 Add a multithreaded test for prof_sys_thread_name.
Verified that this catches the issue being fixed in 5fd5583.
2023-04-05 10:03:12 -07:00
Amaury Séchet
5266152d79 Simplify the logic in ph_remove 2023-03-31 14:35:31 -07:00
Amaury Séchet
be6da4f663 Do not maintain root->prev in ph_remove. 2023-03-31 14:34:57 -07:00
Amaury Séchet
543e2d61e6 Simplify the logic in ph_insert
Also fixes what looks like an off by one error in the lazy aux list
merge part of the code that previously never touched the last node in
the aux list.
2023-03-31 14:34:24 -07:00
guangli-dai
31e01a98f1 Fix the rdtscp detection bug and add prefix for the macro. 2023-03-23 11:16:19 -07:00
Qi Wang
8b64be3441 Explicit arena assignment in test_tcache_max.
Otherwise the associated arena could change with percpu arena enabled.
2023-03-22 15:16:43 -07:00
Qi Wang
8e7353a19b Explicit arena assignment in test_thread_idle.
Otherwise the associated arena could change with percpu arena enabled.
2023-03-22 15:16:43 -07:00
Marvin Schmidt
45249cf5a9 Fix exception specification error for hosts using musl libc
It turns out that the previous commit did not suffice since the
JEMALLOC_SYS_NOTHROW definition also causes the same exception specification
errors as JEMALLOC_USE_CXX_THROW did:
```
x86_64-pc-linux-musl-cc -std=gnu11 -Werror=unknown-warning-option -Wall -Wextra -Wshorten-64-to-32 -Wsign-compare -Wundef -Wno-format-zero-length -Wpointer-
arith -Wno-missing-braces -Wno-missing-field-initializers -pipe -g3 -fvisibility=hidden -Wimplicit-fallthrough -O3 -funroll-loops -march=native -O2 -pipe -c -march=native -O2 -pipe -D_GNU_SOURCE -D_REENTRANT -Iinclude -Iinclude -o src/background_thread.o src/background_thread.c
In file included from src/jemalloc_cpp.cpp:9:
In file included from include/jemalloc/internal/jemalloc_preamble.h:27:
include/jemalloc/internal/../jemalloc.h:254:32: error: exception specification in declaration does not match previous declaration
    void JEMALLOC_SYS_NOTHROW   *je_malloc(size_t size)
                                 ^
include/jemalloc/internal/../jemalloc.h:75:21: note: expanded from macro 'je_malloc'
                    ^
/usr/x86_64-pc-linux-musl/include/stdlib.h:40:7: note: previous declaration is here
void *malloc (size_t);
      ^
```

On systems using the musl C library we have to omit the exception specification
on the malloc function family, as is done for macOS, FreeBSD, and OpenBSD.
2023-03-16 12:11:40 -07:00
Marvin Schmidt
aba1645f2d configure: Handle *-linux-musl* hosts properly
This is the same as the `*-*-linux*` case with the two exceptions that
we don't set glibc=1 and don't define JEMALLOC_USE_CXX_THROW
2023-03-16 12:11:40 -07:00
Qi Wang
d503d72129 Add the missing descriptions in AC_DEFINE 2023-03-14 16:47:00 -07:00
Qi Wang
71bc1a3d91 Avoid assuming the arena id in test when percpu_arena is used. 2023-03-13 10:50:10 -07:00
Amaury Séchet
f743690739 Remove unused mutex from hpa_central 2023-03-10 11:25:47 -08:00
Chris Seymour
4edea8eb8e switch to https 2023-03-09 11:44:02 -08:00
guangli-dai
09e4b38fb1 Use asm volatile during benchmarks. 2023-02-24 11:17:48 -08:00
Fernando Pelliccioni
e8b28908de [MSVC] support for Visual Studio 2019 and 2022 2023-02-21 13:39:25 -08:00
barracuda156
4422f88d17 Makefile.in: link with g++ when cxx enabled 2023-02-21 13:26:58 -08:00
Qi Wang
c7805f1eb5 Add a header in HPA stats for the nonfull slabs. 2023-02-17 13:31:27 -08:00
Qi Wang
b6125120ac Add an explicit name to the dedicated oversize arena. 2023-02-17 13:31:09 -08:00
Qi Wang
97b313c7d4 More conservative setting for /test/unit/background_thread_enable.
Lower the thread and arena count to avoid resource exhaustion on 32-bit.
2023-02-16 14:42:21 -08:00
Qi Wang
5fd55837bb Fix thread_name updating for heap profiling.
The current thread name reading path updates the name every time, which requires
both an alloc and a dalloc -- and the temporary NULL value in between causes
races where the prof dump read path observes a NULL name.

Minimize the changes in this commit to isolate the bugfix testing; will also
refactor the whole thread name paths later.
2023-02-15 17:49:40 -08:00
Qi Wang
8580c65f81 Implement prof sample hooks "experimental.hooks.prof_sample(_free)".
The added hooks hooks.prof_sample and hooks.prof_sample_free are intended to
allow advanced users to track additional information, to enable new ways of
profiling on top of the jemalloc heap profile and sample features.

The sample hook is invoked after the allocation and backtracing, and forwards
both the allocation and the backtrace to the user hook; the sample_free hook
runs before the actual deallocation, and forwards only the ptr and usz to the
hook.
2022-12-07 16:06:49 -08:00
guangli-dai
a74acb57e8 Fix dividing 0 error in stress/cpp/microbench
Summary:
Per issue #2356, some CXX compilers may optimize away the
new/delete operations in stress/cpp/microbench.cpp.
Thus, this commit (1) bumps the time interval to 1 if it is 0, and
(2) makes the pointers in the microbench volatile.
2022-12-06 10:46:14 -08:00
Guangli Dai
e8f9f13811 Inline free and sdallocx into operator delete 2022-11-21 11:14:05 -08:00
guangli-dai
06374d2a6a Benchmark operator delete
Added the microbenchmark for operator delete.
Also modified bench.h so that it can be used in C++.
2022-11-21 11:14:05 -08:00
guangli-dai
14ad8205bf Update the ratio display in benchmark
In bench.h, specify the ratio as the time consumption ratio and
modify the display of the ratio.
2022-11-21 11:14:05 -08:00
Qi Wang
481bbfc990 Add a configure option --enable-force-getenv.
Allows the use of getenv() rather than secure_getenv() to read MALLOC_CONF.
This helps in situations where hosts are under full control and setting
MALLOC_CONF is needed while the process is setuid.  Disabled by default.
2022-11-04 13:37:14 -07:00
Qi Wang
143e9c4a2f Enable fast thread locals for dealloc-only threads.
Previously, if a thread does only deallocations, it stays on the slow path /
minimally initialized state forever.  However, dealloc-only is a valid pattern
for dedicated reclamation threads -- it means the thread cache is disabled (no
batched flush) for them, which causes high overhead and contention.

Added a condition to fully initialize TSD once a fair amount of dealloc
activity is observed.
2022-10-25 09:54:38 -07:00
Paul Smith
be65438f20 jemalloc_internal_types.h: Use alloca if __STDC_NO_VLA__ is defined
No currently available version of the Visual Studio C compiler supports
variable-length arrays, even if it defines __STDC_VERSION__ >= C99.
As far as I know, Microsoft has no plans to ever support VLAs in MSVC.

The C11 standard requires that the __STDC_NO_VLA__ macro be defined if
the compiler doesn't support VLAs, so fall back to alloca() if so.
2022-10-14 15:48:32 -07:00
divanorama
1897f185d2 Fix safety_check segfault in double free test 2022-10-03 10:55:10 -07:00
Jordan Rome
b04e7666f2 update PROFILING_INTERNALS.md
Expand the bad example of summing before unbiasing.
2022-10-03 10:48:29 -07:00
David Carlier
4c95c953e2 fix build for non linux/BSD platforms. 2022-10-03 10:42:09 -07:00
divanorama
3de0c24859 Disable builtin malloc in tests
With `--with-jemalloc-prefix=` and without `-fno-builtin`, both clang and gcc (at `-O1` and above) may optimize out `malloc` calls
whose result is unused. Comparing the result to NULL also doesn't necessarily count as a use.

This won't be a problem in most client programs, as it only concerns genuinely unused pointers, but in
tests it's important to actually execute allocations.
`-fno-builtin` should disable this optimization for both gcc and clang, and applying it only to test code hopefully shouldn't be an issue.
Another alternative is to force a "use" of the result, but that'd require more changes and might miss other optimization-related issues.

This should resolve https://github.com/jemalloc/jemalloc/issues/2091
2022-10-03 10:39:13 -07:00
Lily Wang
c0c9783ec9 Add vcpkg installation instructions 2022-09-19 15:15:28 -07:00
Guangli Dai
c9ac1f4701 Fix a bug in C++ integration test. 2022-09-16 15:04:59 -07:00
Guangli Dai
ba19d2cb78 Add arena-level name.
An arena-level name can help identify manual arenas.
2022-09-16 15:04:59 -07:00
Guangli Dai
a0734fd6ee Making jemalloc max stack depth a runtime option 2022-09-12 13:56:22 -07:00
Abael He
56ddbea270 error: implicit declaration of function 'pthread_create_fptr_init' is invalid in C99
./autogen.sh \
&& ./configure --prefix=/usr/local  --enable-static   --enable-autogen --enable-xmalloc --with-static-libunwind=/usr/local/lib/libunwind.a --enable-lazy-lock --with-jemalloc-prefix='' \
&& make -j16

...
gcc -std=gnu11 -Werror=unknown-warning-option -Wall -Wextra -Wshorten-64-to-32 -Wsign-compare -Wundef -Wno-format-zero-length -Wpointer-arith -Wno-missing-braces -Wno-missing-field-initializers -pipe -g3 -Wimplicit-fallthrough -O3 -funroll-loops -fPIC -DPIC -c -D_REENTRANT -Iinclude -Iinclude -DJEMALLOC_NO_PRIVATE_NAMESPACE -o src/edata_cache.sym.o src/edata_cache.c
src/background_thread.c:768:6: error: implicit declaration of function 'pthread_create_fptr_init' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
            pthread_create_fptr_init()) {
            ^
src/background_thread.c:768:6: note: did you mean 'pthread_create_wrapper_init'?
src/background_thread.c:34:1: note: 'pthread_create_wrapper_init' declared here
pthread_create_wrapper_init(void) {
^
1 error generated.
make: *** [src/background_thread.sym.o] Error 1
make: *** Waiting for unfinished jobs....
2022-09-07 11:56:41 -07:00
Guangli Dai
ce29b4c3d9 Refactor the remote / cross thread cache bin stats reading
Refactored cache_bin.h so that only one function is racy.
2022-09-06 19:41:19 -07:00
Guangli Dai
42daa1ac44 Add double free detection using slab bitmap for debug build
Add a sanity check for double free issues in the arena, in case the tcache has been flushed.
2022-09-06 12:54:21 -07:00
Ivan Zaitsev
36366f3c4c Add double free detection in thread cache for debug build
Add a new runtime option `debug_double_free_max_scan` that specifies the max
number of stack entries to scan in the cache bin when trying to detect a
double free (currently debug build only).
2022-08-04 16:58:22 -07:00
David CARLIER
adc70c0511 update travis 2022-07-19 13:23:08 -07:00
David CARLIER
4e12d21c8d enabled percpu_arena settings on macOs.
follow-up on #2280
2022-07-19 13:23:08 -07:00
David Carlier
58478412be OpenBSD build fix. still no cpu affinity.
- enabling pthread_get/pthread_set_name_np api.
- disabling per thread cpu affinity handling, unsupported on this platform.
2022-07-19 13:20:11 -07:00
Qi Wang
a1c7d9c046 Add the missing opt.cache_oblivious handling. 2022-07-14 22:41:27 -07:00
Jasmin Parent
41a859ef73 Remove duplicated words in documentation 2022-07-11 15:30:16 -07:00
Azat Khuzhin
cb578bbe01 Fix possible "nmalloc >= ndalloc" assertion
In arena_stats_merge(), nmalloc was read first and ndalloc after.

However, with this order it is possible for some thread to increment
ndalloc in between, so that nmalloc < ndalloc and the assertion fails,
as again found by ClickHouse CI [1] (even after #2234).

  [1]: https://github.com/ClickHouse/ClickHouse/issues/31531

Swap the order to avoid the possible assertion failure.

Cc: @interwq
Follow-up for: #2234
2022-07-11 15:27:51 -07:00
David CARLIER
a9215bf18a CI update FreeBSD version. 2022-06-28 11:48:23 -07:00
Alex Lapenkou
3713932836 Update building for Windows instructions
Explain how to build for Windows in INSTALL.md and remove another readme.txt in
an obscure location.
2022-06-14 14:04:48 -07:00
David Carlier
4fc5c4fbac New configure option '--enable-pageid' for Linux
The option makes jemalloc use prctl with PR_SET_VMA to tag memory mappings with
"jemalloc_pg" or "jemalloc_pg_overcommit". This makes it easy to identify
jemalloc's mappings in /proc/<pid>/maps. PR_SET_VMA is only available in Linux
5.17 and above.
2022-06-09 18:54:08 -07:00
Qi Wang
b950934916 Enable retain by default on macOS.
A high number of mappings results in unusually high fork() cost on macOS.  Retain
fixes the issue, at the small cost of extra VM space reserved.
2022-06-09 11:37:44 -07:00
David Carlier
df8f7d10af Implement malloc_getcpu for amd64 and arm64 macOS
This enables per-CPU arenas on macOS
2022-06-08 15:13:55 -07:00
Alex Lapenkou
df7ad8a9b6 Revert "Echo installed files via verbose 'install' command"
This reverts commit f15d8f3b41. "install -v"
turned out not to be portable and does not work on NetBSD.
2022-06-07 12:28:45 -07:00
barracuda156
70e3735f3a jemalloc: fix PowerPC definitions in quantum.h 2022-05-26 10:51:10 -07:00
Alex Lapenkou
5b1f2cc5d7 Implement pvalloc replacement
Despite being an obsolete function, pvalloc is still present in GLIBC and should
work correctly when jemalloc replaces the libc allocator.
2022-05-18 17:01:09 -07:00
Qi Wang
cd5aaf308a Improve the failure message upon opt_experimental_infallible_new. 2022-05-17 16:07:40 -07:00
Yuriy Chernyshov
70d4102f48 Fix compiling edata.h with MSVC
At the time, an attempt to compile jemalloc 5.3.0 with MSVC 2019 results in the following error message:

> jemalloc/include/jemalloc/internal/edata.h:660: error C4576: a parenthesized type followed by an initializer list is a non-standard explicit type conversion syntax
2022-05-09 14:51:07 -07:00
445 changed files with 42535 additions and 22437 deletions


@ -9,6 +9,7 @@ environment:
- MSYSTEM: MINGW64
CPU: x86_64
CONFIG_FLAGS: --enable-debug
EXTRA_CFLAGS: "-fcommon"
- MSYSTEM: MINGW32
CPU: i686
MSVC: x86
@ -16,21 +17,30 @@ environment:
- MSYSTEM: MINGW32
CPU: i686
CONFIG_FLAGS: --enable-debug
EXTRA_CFLAGS: "-fcommon"
- MSYSTEM: MINGW64
CPU: x86_64
MSVC: amd64
CONFIG_FLAGS:
- MSYSTEM: MINGW64
CPU: x86_64
CONFIG_FLAGS:
EXTRA_CFLAGS: "-fcommon"
- MSYSTEM: MINGW32
CPU: i686
MSVC: x86
CONFIG_FLAGS:
- MSYSTEM: MINGW32
CPU: i686
CONFIG_FLAGS:
EXTRA_CFLAGS: "-fcommon"
install:
- set PATH=c:\msys64\%MSYSTEM%\bin;c:\msys64\usr\bin;%PATH%
- if defined MSVC call "c:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" %MSVC%
- if defined MSVC pacman --noconfirm -Rsc mingw-w64-%CPU%-gcc gcc
- pacman --noconfirm -Syuu
- pacman --noconfirm -S autoconf
build_script:
- bash -c "autoconf"


@ -1,46 +0,0 @@
env:
CIRRUS_CLONE_DEPTH: 1
ARCH: amd64
task:
matrix:
env:
DEBUG_CONFIG: --enable-debug
env:
DEBUG_CONFIG: --disable-debug
matrix:
- env:
PROF_CONFIG: --enable-prof
- env:
PROF_CONFIG: --disable-prof
matrix:
- name: 64-bit
env:
CC:
CXX:
- name: 32-bit
env:
CC: cc -m32
CXX: c++ -m32
matrix:
- env:
UNCOMMON_CONFIG:
- env:
UNCOMMON_CONFIG: --with-lg-page=16 --with-malloc-conf=tcache:false
freebsd_instance:
matrix:
image: freebsd-12-3-release-amd64
install_script:
- sed -i.bak -e 's,pkg+http://pkg.FreeBSD.org/\${ABI}/quarterly,pkg+http://pkg.FreeBSD.org/\${ABI}/latest,' /etc/pkg/FreeBSD.conf
- pkg upgrade -y
- pkg install -y autoconf gmake
script:
- autoconf
# We don't perfectly track freebsd stdlib.h definitions. This is fine when
# we count as a system header, but breaks otherwise, like during these
# tests.
- ./configure --with-jemalloc-prefix=ci_ ${DEBUG_CONFIG} ${PROF_CONFIG} ${UNCOMMON_CONFIG}
- export JFLAG=`sysctl -n kern.smp.cpus`
- gmake -j${JFLAG}
- gmake -j${JFLAG} tests
- gmake check


@ -4,10 +4,10 @@
# AccessModifierOffset: -2
AlignAfterOpenBracket: DontAlign
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignConsecutiveDeclarations: true
AlignEscapedNewlines: Right
AlignOperands: false
AlignTrailingComments: false
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
@ -20,16 +20,16 @@ AlwaysBreakBeforeMultilineStrings: true
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
BeforeCatch: false
BeforeElse: false
AfterClass: true
AfterControlStatement: true
AfterEnum: true
AfterFunction: true
AfterNamespace: true
AfterObjCDeclaration: true
AfterStruct: true
AfterUnion: true
BeforeCatch: true
BeforeElse: true
IndentBraces: false
# BreakAfterJavaFieldAnnotations: true
BreakBeforeBinaryOperators: NonAssignment
@ -43,7 +43,7 @@ ColumnLimit: 80
# CompactNamespaces: true
# ConstructorInitializerAllOnOneLineOrOnePerLine: true
# ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 2
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
@ -57,7 +57,7 @@ ForEachMacros: [ ql_foreach, qr_foreach, ]
# IncludeIsMainRegex: ''
IndentCaseLabels: false
IndentPPDirectives: AfterHash
IndentWidth: 4
IndentWidth: 8
IndentWrappedFunctionNames: false
# JavaImportGroups: []
# JavaScriptQuotes: Leave
@ -73,8 +73,8 @@ MaxEmptyLinesToKeep: 1
# ObjCSpaceAfterProperty: false
# ObjCSpaceBeforeProtocolList: false
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakAssignment: 100
PenaltyBreakBeforeFirstCallParameter: 100
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
@ -96,7 +96,7 @@ PointerAlignment: Right
# - 'cpp'
# BasedOnStyle: llvm
# CanonicalDelimiter: 'cc'
ReflowComments: true
ReflowComments: false
SortIncludes: false
SpaceAfterCStyleCast: false
# SpaceAfterTemplateKeyword: true
@ -107,7 +107,7 @@ SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
# SpaceBeforeRangeBasedForLoopColon: true
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInCStyleCastParentheses: false
# SpacesInContainerLiterals: false
@ -118,5 +118,5 @@ SpacesInSquareBrackets: false
# used by some of the core jemalloc developers.
# StatementMacros: []
TabWidth: 8
UseTab: Never
UseTab: ForIndentation
...

.git-blame-ignore-revs (new file, 2 lines)

@ -0,0 +1,2 @@
554185356bf990155df8d72060c4efe993642baf
34f359e0ca613b5f9d970e9b2152a5203c9df8d6

.github/workflows/check_formatting.yaml (new vendored file, 10 lines)

@ -0,0 +1,10 @@
name: 'Check Formatting'
on: [pull_request]
jobs:
check-formatting:
runs-on: ubuntu-latest
steps:
- name: Check out repository
uses: actions/checkout@v4
- name: Check for trailing whitespace
run: scripts/check_trailing_whitespace.sh

.github/workflows/freebsd-ci.yml (new vendored file, 66 lines)

@ -0,0 +1,66 @@
# This config file is generated by ./scripts/gen_gh_actions.py.
# Do not edit by hand.
name: FreeBSD CI
on:
push:
branches: [ dev, ci_travis ]
pull_request:
branches: [ dev ]
jobs:
test-freebsd:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
debug: ['--enable-debug', '--disable-debug']
prof: ['--enable-prof', '--disable-prof']
arch: ['64-bit', '32-bit']
uncommon:
- ''
- '--with-lg-page=16 --with-malloc-conf=tcache:false'
name: FreeBSD (${{ matrix.arch }}, debug=${{ matrix.debug }}, prof=${{ matrix.prof }}${{ matrix.uncommon && ', uncommon' || '' }})
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 1
- name: Test on FreeBSD
uses: vmactions/freebsd-vm@v1
with:
release: '15.0'
usesh: true
prepare: |
pkg install -y autoconf gmake
run: |
# Verify we're running in FreeBSD
echo "==== System Information ===="
uname -a
freebsd-version
echo "============================"
# Set compiler flags for 32-bit if needed
if [ "${{ matrix.arch }}" = "32-bit" ]; then
export CC="cc -m32"
export CXX="c++ -m32"
fi
# Generate configure script
autoconf
# Configure with matrix options
./configure --with-jemalloc-prefix=ci_ ${{ matrix.debug }} ${{ matrix.prof }} ${{ matrix.uncommon }}
# Get CPU count for parallel builds
export JFLAG=$(sysctl -n kern.smp.cpus)
gmake -j${JFLAG}
gmake -j${JFLAG} tests
gmake check

.github/workflows/linux-ci.yml (new vendored file, 695 lines)

@ -0,0 +1,695 @@
# This config file is generated by ./scripts/gen_gh_actions.py.
# Do not edit by hand.
name: Linux CI
on:
push:
branches: [ dev, ci_travis ]
pull_request:
branches: [ dev ]
jobs:
test-linux:
runs-on: ubuntu-24.04
strategy:
fail-fast: false
matrix:
include:
- env:
CC: gcc
CXX: g++
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: clang
CXX: clang++
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
COMPILER_FLAGS: -m32
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --enable-debug
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --enable-prof
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --disable-stats
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --disable-libdl
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --enable-opt-safety-checks
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --with-lg-page=16
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=dss:primary"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=background_thread:true"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: clang
CXX: clang++
CROSS_COMPILE_32BIT: yes
COMPILER_FLAGS: -m32
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- env:
CC: clang
CXX: clang++
CONFIGURE_FLAGS: --enable-debug
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- env:
CC: clang
CXX: clang++
CONFIGURE_FLAGS: --enable-prof
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- env:
CC: clang
CXX: clang++
CONFIGURE_FLAGS: --disable-stats
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- env:
CC: clang
CXX: clang++
CONFIGURE_FLAGS: --disable-libdl
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- env:
CC: clang
CXX: clang++
CONFIGURE_FLAGS: --enable-opt-safety-checks
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- env:
CC: clang
CXX: clang++
CONFIGURE_FLAGS: --with-lg-page=16
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- env:
CC: clang
CXX: clang++
CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- env:
CC: clang
CXX: clang++
CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- env:
CC: clang
CXX: clang++
CONFIGURE_FLAGS: "--with-malloc-conf=dss:primary"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- env:
CC: clang
CXX: clang++
CONFIGURE_FLAGS: "--with-malloc-conf=percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- env:
CC: clang
CXX: clang++
CONFIGURE_FLAGS: "--with-malloc-conf=background_thread:true"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
COMPILER_FLAGS: -m32
CONFIGURE_FLAGS: --enable-debug
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
COMPILER_FLAGS: -m32
CONFIGURE_FLAGS: --enable-prof
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
COMPILER_FLAGS: -m32
CONFIGURE_FLAGS: --disable-stats
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
COMPILER_FLAGS: -m32
CONFIGURE_FLAGS: --disable-libdl
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
COMPILER_FLAGS: -m32
CONFIGURE_FLAGS: --enable-opt-safety-checks
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
COMPILER_FLAGS: -m32
CONFIGURE_FLAGS: --with-lg-page=16
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
COMPILER_FLAGS: -m32
CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
COMPILER_FLAGS: -m32
CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
COMPILER_FLAGS: -m32
CONFIGURE_FLAGS: "--with-malloc-conf=dss:primary"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
COMPILER_FLAGS: -m32
CONFIGURE_FLAGS: "--with-malloc-conf=percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
COMPILER_FLAGS: -m32
CONFIGURE_FLAGS: "--with-malloc-conf=background_thread:true"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-debug --enable-prof"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-debug --disable-stats"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-debug --disable-libdl"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-debug --enable-opt-safety-checks"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-debug --with-lg-page=16"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-debug --enable-prof --enable-prof-frameptr"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-debug --with-malloc-conf=tcache:false"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-debug --with-malloc-conf=dss:primary"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-debug --with-malloc-conf=percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-debug --with-malloc-conf=background_thread:true"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --disable-stats"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --disable-libdl"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --enable-opt-safety-checks"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --with-lg-page=16"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --enable-prof --enable-prof-frameptr"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --with-malloc-conf=tcache:false"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --with-malloc-conf=dss:primary"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --with-malloc-conf=percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --with-malloc-conf=background_thread:true"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-stats --disable-libdl"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-stats --enable-opt-safety-checks"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-stats --with-lg-page=16"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-stats --enable-prof --enable-prof-frameptr"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-stats --with-malloc-conf=tcache:false"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-stats --with-malloc-conf=dss:primary"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-stats --with-malloc-conf=percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-stats --with-malloc-conf=background_thread:true"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-libdl --enable-opt-safety-checks"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-libdl --with-lg-page=16"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-libdl --enable-prof --enable-prof-frameptr"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-libdl --with-malloc-conf=tcache:false"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-libdl --with-malloc-conf=dss:primary"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-libdl --with-malloc-conf=percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--disable-libdl --with-malloc-conf=background_thread:true"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-opt-safety-checks --with-lg-page=16"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-opt-safety-checks --enable-prof --enable-prof-frameptr"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-opt-safety-checks --with-malloc-conf=tcache:false"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-opt-safety-checks --with-malloc-conf=dss:primary"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-opt-safety-checks --with-malloc-conf=percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-opt-safety-checks --with-malloc-conf=background_thread:true"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-lg-page=16 --enable-prof --enable-prof-frameptr"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-lg-page=16 --with-malloc-conf=tcache:false"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-lg-page=16 --with-malloc-conf=dss:primary"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-lg-page=16 --with-malloc-conf=percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-lg-page=16 --with-malloc-conf=background_thread:true"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr --with-malloc-conf=tcache:false"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr --with-malloc-conf=dss:primary"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr --with-malloc-conf=percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr --with-malloc-conf=background_thread:true"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false,dss:primary"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false,percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false,background_thread:true"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=dss:primary,percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=dss:primary,background_thread:true"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=percpu_arena:percpu,background_thread:true"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-debug --disable-cache-oblivious --enable-stats --enable-log --enable-prof"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-debug --enable-experimental-smallocx --enable-stats --enable-prof"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
steps:
- uses: actions/checkout@v4
- name: Show OS version
run: |
echo "=== System Information ==="
uname -a
echo ""
echo "=== Architecture ==="
uname -m
arch
echo ""
echo "=== OS Release ==="
cat /etc/os-release || true
echo ""
echo "=== CPU Info ==="
lscpu | grep -E "Architecture|CPU op-mode|Byte Order|CPU\(s\):" || true
- name: Install dependencies (32-bit)
if: matrix.env.CROSS_COMPILE_32BIT == 'yes'
run: |
sudo dpkg --add-architecture i386
sudo apt-get update
sudo apt-get install -y gcc-multilib g++-multilib libc6-dev-i386
- name: Build and test
env:
CC: ${{ matrix.env.CC }}
CXX: ${{ matrix.env.CXX }}
COMPILER_FLAGS: ${{ matrix.env.COMPILER_FLAGS }}
CONFIGURE_FLAGS: ${{ matrix.env.CONFIGURE_FLAGS }}
EXTRA_CFLAGS: ${{ matrix.env.EXTRA_CFLAGS }}
run: |
# Verify the script generates the same output
./scripts/gen_gh_actions.py > gh_actions_script.yml
# Run autoconf
autoconf
# Configure with flags
if [ -n "$COMPILER_FLAGS" ]; then
./configure CC="${CC} ${COMPILER_FLAGS}" CXX="${CXX} ${COMPILER_FLAGS}" $CONFIGURE_FLAGS
else
./configure $CONFIGURE_FLAGS
fi
# Build
make -j3
make -j3 tests
# Run tests
make check
test-linux-arm64:
runs-on: ubuntu-24.04-arm
strategy:
fail-fast: false
matrix:
include:
- env:
CC: gcc
CXX: g++
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: clang
CXX: clang++
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --enable-debug
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --enable-prof
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --disable-stats
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --disable-libdl
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --enable-opt-safety-checks
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --with-lg-page=16
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-lg-page=16 --with-lg-hugepage=29"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--enable-prof --enable-prof-frameptr"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=dss:primary"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=background_thread:true"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds"
steps:
- uses: actions/checkout@v4
- name: Show OS version
run: |
echo "=== System Information ==="
uname -a
echo ""
echo "=== Architecture ==="
uname -m
arch
echo ""
echo "=== OS Release ==="
cat /etc/os-release || true
echo ""
echo "=== CPU Info ==="
lscpu | grep -E "Architecture|CPU op-mode|Byte Order|CPU\(s\):" || true
- name: Install dependencies (32-bit)
if: matrix.env.CROSS_COMPILE_32BIT == 'yes'
run: |
sudo dpkg --add-architecture i386
sudo apt-get update
sudo apt-get install -y gcc-multilib g++-multilib libc6-dev-i386
- name: Build and test
env:
CC: ${{ matrix.env.CC }}
CXX: ${{ matrix.env.CXX }}
COMPILER_FLAGS: ${{ matrix.env.COMPILER_FLAGS }}
CONFIGURE_FLAGS: ${{ matrix.env.CONFIGURE_FLAGS }}
EXTRA_CFLAGS: ${{ matrix.env.EXTRA_CFLAGS }}
run: |
# Verify the script generates the same output
./scripts/gen_gh_actions.py > gh_actions_script.yml
# Run autoconf
autoconf
# Configure with flags
if [ -n "$COMPILER_FLAGS" ]; then
./configure CC="${CC} ${COMPILER_FLAGS}" CXX="${CXX} ${COMPILER_FLAGS}" $CONFIGURE_FLAGS
else
./configure $CONFIGURE_FLAGS
fi
# Build
make -j3
make -j3 tests
# Run tests
make check

.github/workflows/macos-ci.yml (new file)

@@ -0,0 +1,212 @@
# This config file is generated by ./scripts/gen_gh_actions.py.
# Do not edit by hand.
name: macOS CI
on:
push:
branches: [ dev, ci_travis ]
pull_request:
branches: [ dev ]
jobs:
test-macos:
runs-on: macos-15-intel
strategy:
fail-fast: false
matrix:
include:
- env:
CC: gcc
CXX: g++
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --enable-debug
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --disable-stats
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --disable-libdl
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --enable-opt-safety-checks
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --with-lg-page=16
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
steps:
- uses: actions/checkout@v4
- name: Show OS version
run: |
echo "=== macOS Version ==="
sw_vers
echo ""
echo "=== Architecture ==="
uname -m
arch
echo ""
echo "=== CPU Info ==="
sysctl -n machdep.cpu.brand_string
sysctl -n hw.machine
- name: Install dependencies
run: |
brew install autoconf
- name: Build and test
env:
CC: ${{ matrix.env.CC || 'gcc' }}
CXX: ${{ matrix.env.CXX || 'g++' }}
COMPILER_FLAGS: ${{ matrix.env.COMPILER_FLAGS }}
CONFIGURE_FLAGS: ${{ matrix.env.CONFIGURE_FLAGS }}
EXTRA_CFLAGS: ${{ matrix.env.EXTRA_CFLAGS }}
run: |
# Run autoconf
autoconf
# Configure with flags
if [ -n "$COMPILER_FLAGS" ]; then
./configure CC="${CC} ${COMPILER_FLAGS}" CXX="${CXX} ${COMPILER_FLAGS}" $CONFIGURE_FLAGS
else
./configure $CONFIGURE_FLAGS
fi
# Build
make -j3
make -j3 tests
# Run tests
make check
test-macos-arm64:
runs-on: macos-15
strategy:
fail-fast: false
matrix:
include:
- env:
CC: gcc
CXX: g++
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --enable-debug
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --disable-stats
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --disable-libdl
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --enable-opt-safety-checks
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --with-lg-page=16
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-lg-page=16 --with-lg-hugepage=29"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=tcache:false"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: "--with-malloc-conf=percpu_arena:percpu"
EXTRA_CFLAGS: "-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
steps:
- uses: actions/checkout@v4
- name: Show OS version
run: |
echo "=== macOS Version ==="
sw_vers
echo ""
echo "=== Architecture ==="
uname -m
arch
echo ""
echo "=== CPU Info ==="
sysctl -n machdep.cpu.brand_string
sysctl -n hw.machine
- name: Install dependencies
run: |
brew install autoconf
- name: Build and test
env:
CC: ${{ matrix.env.CC || 'gcc' }}
CXX: ${{ matrix.env.CXX || 'g++' }}
COMPILER_FLAGS: ${{ matrix.env.COMPILER_FLAGS }}
CONFIGURE_FLAGS: ${{ matrix.env.CONFIGURE_FLAGS }}
EXTRA_CFLAGS: ${{ matrix.env.EXTRA_CFLAGS }}
run: |
# Run autoconf
autoconf
# Configure with flags
if [ -n "$COMPILER_FLAGS" ]; then
./configure CC="${CC} ${COMPILER_FLAGS}" CXX="${CXX} ${COMPILER_FLAGS}" $CONFIGURE_FLAGS
else
./configure $CONFIGURE_FLAGS
fi
# Build
make -j3
make -j3 tests
# Run tests
make check

.github/workflows/static_analysis.yaml (new file)

@@ -0,0 +1,68 @@
name: 'Static Analysis'
on: [pull_request]
jobs:
static-analysis:
runs-on: ubuntu-latest
steps:
# We build libunwind ourselves because sadly the version
# provided by Ubuntu via apt-get is much too old.
- name: Check out libunwind
uses: actions/checkout@v4
with:
repository: libunwind/libunwind
path: libunwind
ref: 'v1.6.2'
github-server-url: 'https://github.com'
- name: Install libunwind
run: |
cd libunwind
autoreconf -i
./configure --prefix=/usr
make -s -j $(nproc) V=0
sudo make -s install V=0
cd ..
rm -rf libunwind
- name: Check out repository
uses: actions/checkout@v4
# We download LLVM directly from the latest stable release
# on GitHub, because this tends to be much newer than the
# version available via apt-get in Ubuntu.
- name: Download LLVM
uses: dsaltares/fetch-gh-release-asset@master
with:
repo: 'llvm/llvm-project'
version: 'tags/llvmorg-16.0.4'
file: 'clang[+]llvm-.*x86_64-linux-gnu.*'
regex: true
target: 'llvm_assets/'
token: ${{ secrets.GITHUB_TOKEN }}
- name: Install prerequisites
id: install_prerequisites
run: |
tar -C llvm_assets -xaf llvm_assets/*.tar* &
sudo apt-get update
sudo apt-get install -y jq bear python3-pip
pip install codechecker
echo "Extracting LLVM from tar" 1>&2
wait
echo "LLVM_BIN_DIR=$(echo llvm_assets/clang*/bin)" >> "$GITHUB_OUTPUT"
- name: Run static analysis
id: run_static_analysis
run: >
PATH="${{ steps.install_prerequisites.outputs.LLVM_BIN_DIR }}:$PATH"
LDFLAGS='-L/usr/lib'
scripts/run_static_analysis.sh static_analysis_results "$GITHUB_OUTPUT"
- name: Upload static analysis results
if: ${{ steps.run_static_analysis.outputs.HAS_STATIC_ANALYSIS_RESULTS == '1' }}
uses: actions/upload-artifact@v4
with:
name: static_analysis_results
path: static_analysis_results
- name: Check static analysis results
run: |
if [[ "${{ steps.run_static_analysis.outputs.HAS_STATIC_ANALYSIS_RESULTS }}" == '1' ]]
then
echo "::error::Static analysis found issues with your code. Download the 'static_analysis_results' artifact from this workflow and view the 'index.html' file contained within it in a web browser locally for detailed results."
exit 1
fi
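The workflow above passes state between steps through the file named by `$GITHUB_OUTPUT`: one step appends a `KEY=value` line, and a later step branches on the recorded value. A minimal sketch of that gating pattern (the key name mirrors the workflow; everything else is illustrative):

```shell
#!/bin/sh
# Stand-in for the file GitHub Actions provides via $GITHUB_OUTPUT.
GITHUB_OUTPUT="$(mktemp)"

# The analysis step records whether it found anything:
echo "HAS_STATIC_ANALYSIS_RESULTS=1" >> "$GITHUB_OUTPUT"

# The check step reads the value back and gates on it:
RESULT="$(sed -n 's/^HAS_STATIC_ANALYSIS_RESULTS=//p' "$GITHUB_OUTPUT")"
if [ "$RESULT" = "1" ]; then
    echo "static analysis found issues"
fi
rm -f "$GITHUB_OUTPUT"
```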

.github/workflows/windows-ci.yml (new file)

@@ -0,0 +1,155 @@
# This config file is generated by ./scripts/gen_gh_actions.py.
# Do not edit by hand.
name: Windows CI
on:
push:
branches: [ dev, ci_travis ]
pull_request:
branches: [ dev ]
jobs:
test-windows:
runs-on: windows-latest
strategy:
fail-fast: false
matrix:
include:
- env:
CC: gcc
CXX: g++
EXTRA_CFLAGS: -fcommon
- env:
CC: gcc
CXX: g++
CONFIGURE_FLAGS: --enable-debug
EXTRA_CFLAGS: -fcommon
- env:
CC: cl.exe
CXX: cl.exe
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
EXTRA_CFLAGS: -fcommon
- env:
CC: cl.exe
CXX: cl.exe
CONFIGURE_FLAGS: --enable-debug
- env:
CC: gcc
CXX: g++
CROSS_COMPILE_32BIT: yes
CONFIGURE_FLAGS: --enable-debug
EXTRA_CFLAGS: -fcommon
- env:
CC: cl.exe
CXX: cl.exe
CROSS_COMPILE_32BIT: yes
- env:
CC: cl.exe
CXX: cl.exe
CROSS_COMPILE_32BIT: yes
CONFIGURE_FLAGS: --enable-debug
steps:
- uses: actions/checkout@v4
- name: Show OS version
shell: cmd
run: |
echo === Windows Version ===
systeminfo | findstr /B /C:"OS Name" /C:"OS Version"
ver
echo.
echo === Architecture ===
echo PROCESSOR_ARCHITECTURE=%PROCESSOR_ARCHITECTURE%
echo.
- name: Setup MSYS2
uses: msys2/setup-msys2@v2
with:
msystem: ${{ matrix.env.CROSS_COMPILE_32BIT == 'yes' && 'MINGW32' || 'MINGW64' }}
update: true
install: >-
autotools
git
pacboy: >-
make:p
gcc:p
binutils:p
- name: Build and test (MinGW-GCC)
if: matrix.env.CC != 'cl.exe'
shell: msys2 {0}
env:
CC: ${{ matrix.env.CC || 'gcc' }}
CXX: ${{ matrix.env.CXX || 'g++' }}
COMPILER_FLAGS: ${{ matrix.env.COMPILER_FLAGS }}
CONFIGURE_FLAGS: ${{ matrix.env.CONFIGURE_FLAGS }}
EXTRA_CFLAGS: ${{ matrix.env.EXTRA_CFLAGS }}
run: |
# Run autoconf
autoconf
# Configure with flags
if [ -n "$COMPILER_FLAGS" ]; then
./configure CC="${CC} ${COMPILER_FLAGS}" CXX="${CXX} ${COMPILER_FLAGS}" $CONFIGURE_FLAGS
else
./configure $CONFIGURE_FLAGS
fi
# Build (mingw32-make is the "make" command in MSYS2)
mingw32-make -j3
mingw32-make tests
# Run tests
mingw32-make -k check
- name: Setup MSVC environment
if: matrix.env.CC == 'cl.exe'
uses: ilammy/msvc-dev-cmd@v1
with:
arch: ${{ matrix.env.CROSS_COMPILE_32BIT == 'yes' && 'x86' || 'x64' }}
- name: Build and test (MSVC)
if: matrix.env.CC == 'cl.exe'
shell: msys2 {0}
env:
CONFIGURE_FLAGS: ${{ matrix.env.CONFIGURE_FLAGS }}
MSYS2_PATH_TYPE: inherit
run: |
# Export MSVC environment variables for configure
export CC=cl.exe
export CXX=cl.exe
export AR=lib.exe
export NM=dumpbin.exe
export RANLIB=:
# Verify cl.exe is accessible (should be in PATH via inherit)
if ! which cl.exe > /dev/null 2>&1; then
echo "cl.exe not found, trying to locate MSVC..."
# Find and add MSVC bin directory to PATH
MSVC_BIN=$(cmd.exe /c "echo %VCToolsInstallDir%" | tr -d '\r' | sed 's/\\/\//g' | sed 's/C:/\/c/g')
if [ -n "$MSVC_BIN" ]; then
export PATH="$PATH:$MSVC_BIN/bin/Hostx64/x64:$MSVC_BIN/bin/Hostx86/x86"
fi
fi
# Run autoconf
autoconf
# Configure with MSVC
./configure CC=cl.exe CXX=cl.exe AR=lib.exe $CONFIGURE_FLAGS
# Build (mingw32-make is the "make" command in MSYS2)
mingw32-make -j3
# Build tests sequentially due to PDB file issues
mingw32-make tests
# Run tests
mingw32-make -k check
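The MSVC fallback above converts a Windows path into an MSYS2-style one with two `sed` passes. A small sketch of the same conversion on a hypothetical sample path (the workflow reads `%VCToolsInstallDir%` instead):

```shell
#!/bin/sh
# Hypothetical sample path; illustrative only.
WIN_PATH='C:\Tools\MSVC\14.40\'
# First pass: backslashes to forward slashes; second pass: the drive
# letter to an MSYS2 mount point.
MSYS_PATH="$(printf '%s' "$WIN_PATH" | sed 's/\\/\//g' | sed 's/C:/\/c/g')"
echo "$MSYS_PATH"    # /c/Tools/MSVC/14.40/
```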

.gitignore

@@ -45,6 +45,13 @@
/src/*.[od]
/src/*.sym
# These are semantically meaningful for clangd and related tooling.
/build/
/.cache/
compile_commands.json
/static_analysis_raw_results
/static_analysis_results
/run_tests.out/
/test/test.sh
@@ -66,6 +73,7 @@ test/include/test/jemalloc_test_defs.h
/test/stress/[A-Za-z]*
!/test/stress/[A-Za-z]*.*
!/test/stress/pa/
/test/stress/*.[od]
/test/stress/*.out

.travis.yml

@@ -6,82 +6,10 @@
# Differences are explained here:
# https://docs.travis-ci.com/user/languages/minimal-and-generic/
language: minimal
dist: focal
dist: jammy
jobs:
include:
- os: windows
arch: amd64
env: CC=gcc CXX=g++ EXTRA_CFLAGS="-fcommon"
- os: windows
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug" EXTRA_CFLAGS="-fcommon"
- os: windows
arch: amd64
env: CC=cl.exe CXX=cl.exe
- os: windows
arch: amd64
env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes EXTRA_CFLAGS="-fcommon"
- os: windows
arch: amd64
env: CC=cl.exe CXX=cl.exe CONFIGURE_FLAGS="--enable-debug"
- os: windows
arch: amd64
env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes CONFIGURE_FLAGS="--enable-debug" EXTRA_CFLAGS="-fcommon"
- os: windows
arch: amd64
env: CC=cl.exe CXX=cl.exe CROSS_COMPILE_32BIT=yes
- os: windows
arch: amd64
env: CC=cl.exe CXX=cl.exe CROSS_COMPILE_32BIT=yes CONFIGURE_FLAGS="--enable-debug"
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug"
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-libunwind"
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16 --with-malloc-conf=tcache:false"
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --enable-prof --enable-prof-libunwind"
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --with-lg-page=16 --with-malloc-conf=tcache:false"
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes CONFIGURE_FLAGS="--enable-debug"
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-libunwind --with-lg-page=16 --with-malloc-conf=tcache:false"
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes CONFIGURE_FLAGS="--enable-prof --enable-prof-libunwind"
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes CONFIGURE_FLAGS="--with-lg-page=16 --with-malloc-conf=tcache:false"
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --enable-prof --enable-prof-libunwind --with-lg-page=16 --with-malloc-conf=tcache:false"
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes CONFIGURE_FLAGS="--enable-debug --enable-prof --enable-prof-libunwind"
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes CONFIGURE_FLAGS="--enable-debug --with-lg-page=16 --with-malloc-conf=tcache:false"
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes CONFIGURE_FLAGS="--enable-prof --enable-prof-libunwind --with-lg-page=16 --with-malloc-conf=tcache:false"
- os: freebsd
arch: amd64
env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes CONFIGURE_FLAGS="--enable-debug --enable-prof --enable-prof-libunwind --with-lg-page=16 --with-malloc-conf=tcache:false"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ EXTRA_CFLAGS="-Werror -Wno-array-bounds"
@@ -109,6 +37,9 @@ jobs:
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
@@ -142,6 +73,9 @@ jobs:
- os: linux
arch: amd64
env: CC=clang CXX=clang++ CONFIGURE_FLAGS="--with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- os: linux
arch: amd64
env: CC=clang CXX=clang++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- os: linux
arch: amd64
env: CC=clang CXX=clang++ CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
@@ -172,6 +106,9 @@ jobs:
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes COMPILER_FLAGS="-m32" CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
@@ -199,6 +136,9 @@ jobs:
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
@@ -223,6 +163,9 @@ jobs:
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
@@ -244,6 +187,9 @@ jobs:
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats --with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats --enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
@@ -262,6 +208,9 @@ jobs:
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-libdl --with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-libdl --enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-libdl --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
@@ -277,6 +226,9 @@ jobs:
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-opt-safety-checks --with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-opt-safety-checks --enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-opt-safety-checks --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
@@ -289,6 +241,9 @@ jobs:
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-opt-safety-checks --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16 --enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16 --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
@@ -301,6 +256,18 @@ jobs:
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16 --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr --with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr --with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr --with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr --with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=tcache:false,dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
@@ -320,62 +287,47 @@ jobs:
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=percpu_arena:percpu,background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: ppc64le
arch: arm64
env: CC=gcc CXX=g++ EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: ppc64le
arch: arm64
env: CC=clang CXX=clang++ EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes"
- os: linux
arch: arm64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: ppc64le
arch: arm64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: ppc64le
arch: arm64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: ppc64le
arch: arm64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: ppc64le
arch: arm64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-opt-safety-checks" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: ppc64le
arch: arm64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: ppc64le
arch: arm64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16 --with-lg-hugepage=29" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: arm64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-prof --enable-prof-frameptr" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: arm64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: ppc64le
arch: arm64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=dss:primary" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: ppc64le
arch: arm64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=percpu_arena:percpu" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: linux
arch: ppc64le
arch: arm64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=background_thread:true" EXTRA_CFLAGS="-Werror -Wno-array-bounds"
- os: osx
arch: amd64
env: CC=gcc CXX=g++ EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- os: osx
arch: amd64
env: CC=gcc CXX=g++ CROSS_COMPILE_32BIT=yes EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- os: osx
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- os: osx
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-stats" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- os: osx
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--disable-libdl" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- os: osx
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-opt-safety-checks" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- os: osx
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-lg-page=16" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
- os: osx
arch: amd64
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--with-malloc-conf=tcache:false" EXTRA_CFLAGS="-Werror -Wno-array-bounds -Wno-unknown-warning-option -Wno-ignored-attributes -Wno-deprecated-declarations"
# Development build
- os: linux
env: CC=gcc CXX=g++ CONFIGURE_FLAGS="--enable-debug --disable-cache-oblivious --enable-stats --enable-log --enable-prof" EXTRA_CFLAGS="-Werror -Wno-array-bounds"

ChangeLog

@@ -4,6 +4,154 @@ brevity. Much more detail can be found in the git revision history:
https://github.com/jemalloc/jemalloc
* 5.3.1 (Apr 13, 2026)
This release includes over 390 commits spanning bug fixes, new features,
performance optimizations, and portability improvements. System-level
metric improvements of several percent were measured in tested production
workloads. The release has gone through large-scale production testing
at Meta.
New features:
- Support pvalloc. (@Lapenkov: 5b1f2cc5)
- Add double-free detection for the debug build. (@izaitsevfb:
36366f3c, @guangli-dai: 42daa1ac, @divanorama: 1897f185)
- Add compile-time option `--enable-pageid` to enable memory mapping
annotation. (@devnexen: 4fc5c4fb)
- Add runtime option `prof_bt_max` to control the max stack depth for
profiling. (@guangli-dai: a0734fd6)
- Add compile-time option `--enable-force-getenv` to use `getenv` instead
of `secure_getenv`. (@interwq: 481bbfc9)
- Add compile-time option `--disable-dss` to disable the usage of
`sbrk(2)`. (@Svetlitski: ea5b7bea)
- Add runtime option `tcache_ncached_max` to control the number of items
in each size bin in the thread cache. (@guangli-dai: 8a22d10b)
- Add runtime option `calloc_madvise_threshold` to determine whether the
kernel or `memset` is used to zero allocations for `calloc`.
(@nullptr0-0: 5081c16b)
- Add compile-time option `--disable-user-config` to disable reading the
runtime configurations from `/etc/malloc.conf` or environment variable
`MALLOC_CONF`. (@roblabla: c17bf8b3)
- Add runtime option `disable_large_size_classes` to guard the new usable
size calculation, which minimizes the memory overhead for large
allocations, i.e., >= 4 * PAGE. (@guangli-dai: c067a55c, 8347f104)
- Enable `process_madvise` usage and add runtime option
`process_madvise_max_batch` to control the maximum number of regions in
each madvise batch. (@interwq: 22440a02, @spredolac: 4246475b)
- Add mallctl interfaces:
+ `opt.prof_bt_max` (@guangli-dai: a0734fd6)
+ `arena.<i>.name` to set and get arena names. (@guangli-dai: ba19d2cb)
+ `thread.tcache.max` to set and get the `tcache_max` of the current
thread. (@guangli-dai: a442d9b8)
+ `thread.tcache.ncached_max.write` and
`thread.tcache.ncached_max.read_sizeclass` to set and get the
`ncached_max` setup of the current thread. (@guangli-dai: 630f7de9,
6b197fdd)
+ `arenas.hugepage` to return the hugepage size used, also exported to
malloc stats. (@ilvokhin: 90c627ed)
+ `approximate_stats.active` to return an estimate of the current active
bytes; it should not be compared against other retrieved stats.
(@guangli-dai: 0988583d)
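Several of the features above are runtime options, which can be set through the `MALLOC_CONF` environment variable. A minimal sketch (option values are illustrative, and only take effect when the program is linked against this jemalloc build; here `/bin/true` stands in for such a binary, since programs that do not read `MALLOC_CONF` simply ignore it):

```shell
#!/bin/sh
# Combine options with commas; unknown variables are ignored by
# allocators that do not consult MALLOC_CONF.
MALLOC_CONF="prof_bt_max:32,process_madvise_max_batch:16" /bin/true \
    && echo "options set"
```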
Bug fixes:
- Prevent potential deadlocks in decaying during reentrancy. (@interwq:
434a68e2)
- Fix segfault in extent coalescing. (@Svetlitski: 12311fe6)
- Add null pointer detections in mallctl calls. (@Svetlitski: dc0a184f,
0288126d)
- Make mallctl `arenas.lookup` triable without crashing on invalid
pointers. (@auxten: 019cccc2, 5bac3849)
- Demote sampled allocations for proper deallocations during
`arena_reset`. (@Svetlitski: 62648c88)
- Fix jemalloc's `read(2)` and `write(2)`. (@Svetlitski: d2c9ed3d, @lexprfuncall:
9fdc1160)
- Fix the pkg-config metadata file. (@BtbN: ed7e6fe7, ce8ce99a)
- Fix the autogen.sh so that it accepts quoted extra options.
(@honggyukim: f6fe6abd)
- Fix `rallocx()` to set errno to ENOMEM upon OOMing. (@arter97: 38056fea,
@interwq: 83b07578)
- Avoid stack overflow for internal variable array usage. (@nullptr0-0:
47c9bcd4, 48f66cf4, @xinydev: 9169e927)
- Fix background thread initialization race. (@puzpuzpuz: 4d0ffa07)
- Guard os_page_id against a NULL address. (@lexprfuncall: 79cc7dcc)
- Handle tcache init failures gracefully. (@lexprfuncall: a056c20d)
- Fix missing release of acquired neighbor edata in
extent_try_coalesce_impl. (@spredolac: 675ab079)
- Fix memory leak of old curr_reg on san_bump_grow_locked failure.
(@spredolac: 5904a421)
- Fix large alloc nrequests under-counting on cache misses. (@spredolac:
3cc56d32)
Portability improvements:
- Fix the build in C99. (@abaelhe: 56ddbea2)
- Add `pthread_setaffinity_np` detection for non Linux/BSD platforms.
(@devnexen: 4c95c953)
- Make `VARIABLE_ARRAY` compatible with compilers not supporting VLA,
i.e., Visual Studio C compiler in C11 or C17 modes. (@madscientist:
be65438f)
- Fix the build on Linux using musl library. (@marv: aba1645f, 45249cf5)
- Reduce the memory overhead in small allocation sampling for systems
with larger page sizes, e.g., ARM. (@Svetlitski: 5a858c64)
- Add C23's `free_sized` and `free_aligned_sized`. (@Svetlitski:
cdb2c0e0)
- Enable heap profiling on macOS. (@nullptr0-0: 4b555c11)
- Fix incorrect printing on 32bit. (@sundb: 630434bb)
- Make `JEMALLOC_CXX_THROW` compatible with C++ versions newer than
C++17. (@r-barnes, @guangli-dai: 21bcc0a8)
- Fix mmap tag conflicts on macOS. (@kdrag0n: c893fcd1)
- Fix monotonic timer assumption for win32. (@burtonli: 8dc97b11)
- Fix VM over-reservation on systems with larger pages, e.g., aarch64.
(@interwq: cd05b19f)
- Remove `unreachable()` macro conditionally to prevent definition
conflicts for C23+. (@appujee: d8486b26, 4b88bddb)
- Fix dlsym failure observed on FreeBSD. (@rhelmot: 86bbabac)
- Change the default page size to 64KB on aarch64 Linux. (@lexprfuncall:
9442300c)
- Update config.guess and config.sub to the latest version.
(@lexprfuncall: c51949ea)
- Determine the page size on Android from NDK header files.
(@lexprfuncall: c51abba1)
- Improve the portability of grep patterns in configure.ac.
(@lexprfuncall: 365747bc)
- Add compile-time option `--with-cxx-stdlib` to specify the C++ standard
library. (@yuxuanchen1997: a10ef3e1)
Optimizations and refactors:
- Enable tcache for deallocation-only threads. (@interwq: 143e9c4a)
- Inline to accelerate operator delete. (@guangli-dai: e8f9f138)
- Optimize pairing heap's performance. (@deadalnix: 5266152d, be6da4f6,
543e2d61, 10d71315, 92aa52c0, @Svetlitski: 36ca0c1b)
- Inline the storage for thread name in the profiling data. (@interwq:
ce0b7ab6, e62aa478)
- Optimize the hot function `edata_cmp_summary_comp`.
(@Svetlitski: 6841110b, @guangli-dai: 0181aaa4)
- Allocate the thread cache using the base allocator, which enables the
thread cache to use THP when `metadata_thp` is turned on. (@interwq:
72cfdce7)
- Allow the oversize arena to defer purging when background threads are
enabled; the default decay time remains 0 for backward compatibility.
(@interwq: d1313313)
- Optimize thread-local storage implementation on Windows. (@mcfi:
9e123a83, 3a0d9cda)
- Optimize fast path to allow static size class computation. (@interwq:
323ed2e3)
- Redesign tcache GC to regulate the frequency and make it
locality-aware. The new design is enabled by default, guarded by the
option `experimental_tcache_gc`. (@nullptr0-0: 0c88be9e, e2c9f3a9,
14d5dc13, @deadalnix: 5afff2e4)
- Reduce the arena switching overhead by avoiding forced purging when
background threads are enabled. (@interwq: a3910b98)
- Improve the reuse efficiency by limiting the maximum coalesced size for
large extents. (@jiebinn: 3c14707b)
- Refactor thread events to allow registration of users' thread events
and remove prof_threshold as the built-in event. (@spredolac: e6864c60,
015b0179, 34ace916)
Documentation:
- Update Windows building instructions. (@Lapenkov: 37139328)
- Add vcpkg installation instructions. (@LilyWangLL: c0c9783e)
- Update profiling internals with an example. (@jordalgo: b04e7666)
* 5.3.0 (May 6, 2022)
This release contains many speed and space optimizations, from micro


@ -139,6 +139,7 @@ any of the following arguments (not a definitive list) to 'configure':
in the following list that appears to function correctly:
+ libunwind (requires --enable-prof-libunwind)
+ frame pointer (requires --enable-prof-frameptr)
+ libgcc (unless --disable-prof-libgcc)
+ gcc intrinsics (unless --disable-prof-gcc)
@ -147,6 +148,12 @@ any of the following arguments (not a definitive list) to 'configure':
Use the libunwind library (http://www.nongnu.org/libunwind/) for stack
backtracing.
* `--enable-prof-frameptr`
Use the optimized frame pointer unwinder for stack backtracing. Safe
to use in mixed code (with and without frame pointers), but frame
pointers are required to produce meaningful stacks. Linux only.
* `--disable-prof-libgcc`
Disable the use of libgcc's backtracing functionality.
@ -315,13 +322,13 @@ behavior:
'configure' uses this to find programs.
In some cases it may be necessary to work around configuration results that do
not match reality. For example, Linux 4.5 added support for the MADV_FREE flag
to madvise(2), which can cause problems if building on a host with MADV_FREE
support and deploying to a target without. To work around this, use a cache
file to override the relevant configuration variable defined in configure.ac,
e.g.:
not match reality. For example, Linux 3.4 added support for the MADV_DONTDUMP
flag to madvise(2), which can cause problems if building on a host with
MADV_DONTDUMP support and deploying to a target without. To work around this,
use a cache file to override the relevant configuration variable defined in
configure.ac, e.g.:
echo "je_cv_madv_free=no" > config.cache && ./configure -C
echo "je_cv_madv_dontdump=no" > config.cache && ./configure -C
## Advanced compilation
@ -396,6 +403,102 @@ exclusively):
Use this to search for programs used during configuration and building.
## Building for Windows
There are at least two ways to build jemalloc's libraries for Windows. They
differ in their ease of use and flexibility.
### With MSVC solutions
This is the easy, but less flexible approach. It doesn't let you specify
arguments to the `configure` script.
1. Install Cygwin with at least the following packages:
* autoconf
* autogen
* gawk
* grep
* sed
2. Install Visual Studio 2015 or 2017 with Visual C++
3. Add Cygwin\bin to the PATH environment variable
4. Open "x64 Native Tools Command Prompt for VS 2017"
(note: x86/x64 doesn't matter at this point)
5. Generate header files:
sh -c "CC=cl ./autogen.sh"
6. Now the project can be opened and built in Visual Studio:
msvc\jemalloc_vc2017.sln
### With MSYS
This is a more involved approach that offers the same configuration flexibility
as Linux builds. We use it for our CI workflow to test different jemalloc
configurations on Windows.
1. Install the prerequisites
1. MSYS2
2. Chocolatey
3. Visual Studio if you want to compile with MSVC compiler
2. Run your bash emulation. It could be MSYS2 or Git Bash (this manual was
tested on both)
3. Manually and selectively follow the
[before_install.sh](https://github.com/jemalloc/jemalloc/blob/dev/scripts/windows/before_install.sh)
script.
1. Skip the `TRAVIS_OS_NAME` check, `rm -rf C:/tools/msys64` and `choco
uninstall/upgrade` parts.
2. If using `msys2` shell, add path to `RefreshEnv.cmd` to `PATH`:
`PATH="$PATH:/c/ProgramData/chocolatey/bin"`
3. Assign `msys_shell_cmd`, `msys2`, `mingw32` and `mingw64` as in the
script.
4. Pick `CROSS_COMPILE_32BIT`, `CC` and `USE_MSVC` values depending on
your needs. For instance, if you'd like to build for x86_64 Windows
with `gcc`, then `CROSS_COMPILE_32BIT="no"`, `CC="gcc"` and
`USE_MSVC=""`. If you'd like to build for x86 Windows with `cl.exe`,
then `CROSS_COMPILE_32BIT="yes"`, `CC="cl.exe"`, `USE_MSVC="x86"`.
For x86_64 builds with `cl.exe`, assign `USE_MSVC="amd64"` and
`CROSS_COMPILE_32BIT="no"`.
5. Replace the path to `vcvarsall.bat` with the path on your system. For
instance, on my Windows PC with Visual Studio 17, the path is
`C:\Program Files (x86)\Microsoft Visual
Studio\2017\BuildTools\VC\Auxiliary\Build\vcvarsall.bat`.
6. Execute the rest of the script. It will install the required
dependencies and assign the variable `build_env`, which is a function
that executes the following commands with the correct environment
variables set.
4. Use `$build_env <command>` as you would in a Linux shell:
1. `$build_env autoconf`
2. `$build_env ./configure CC="<desired compiler>" <configuration flags>`
3. `$build_env mingw32-make`
If you're having any issues with the above, ensure the following:
5. When you run `cmd //C RefreshEnv.cmd`, you get an output line starting with
`Refreshing`. If it fails saying `RefreshEnv.cmd` is not found, add it to
your `PATH` as described above in item 3.2.
6. When you run `cmd //C $vcvarsall`, it prints a bunch of environment
variables. Otherwise, check the path to `vcvarsall.bat` stored in
`$vcvarsall` and fix it.
### Building from vcpkg
The jemalloc port in vcpkg is kept up to date by Microsoft team members and
community contributors. The vcpkg repository is at
https://github.com/Microsoft/vcpkg. You can download and install jemalloc
using the vcpkg dependency manager:
```shell
git clone https://github.com/Microsoft/vcpkg.git
cd vcpkg
./bootstrap-vcpkg.sh # ./bootstrap-vcpkg.bat for Windows
./vcpkg integrate install
./vcpkg install jemalloc
```
If the version is out of date, please [create an issue or pull
request](https://github.com/Microsoft/vcpkg) on the vcpkg repository.
## Development


@ -123,17 +123,19 @@ C_SRCS := $(srcroot)src/jemalloc.c \
$(srcroot)src/san_bump.c \
$(srcroot)src/hook.c \
$(srcroot)src/hpa.c \
$(srcroot)src/hpa_central.c \
$(srcroot)src/hpa_hooks.c \
$(srcroot)src/hpa_utils.c \
$(srcroot)src/hpdata.c \
$(srcroot)src/inspect.c \
$(srcroot)src/large.c \
$(srcroot)src/log.c \
$(srcroot)src/malloc_io.c \
$(srcroot)src/conf.c \
$(srcroot)src/mutex.c \
$(srcroot)src/nstime.c \
$(srcroot)src/pa.c \
$(srcroot)src/pa_extra.c \
$(srcroot)src/pai.c \
$(srcroot)src/pac.c \
$(srcroot)src/pages.c \
$(srcroot)src/peak_event.c \
@ -141,6 +143,7 @@ C_SRCS := $(srcroot)src/jemalloc.c \
$(srcroot)src/prof_data.c \
$(srcroot)src/prof_log.c \
$(srcroot)src/prof_recent.c \
$(srcroot)src/prof_stack_range.c \
$(srcroot)src/prof_stats.c \
$(srcroot)src/prof_sys.c \
$(srcroot)src/psset.c \
@ -153,8 +156,10 @@ C_SRCS := $(srcroot)src/jemalloc.c \
$(srcroot)src/tcache.c \
$(srcroot)src/test_hooks.c \
$(srcroot)src/thread_event.c \
$(srcroot)src/thread_event_registry.c \
$(srcroot)src/ticker.c \
$(srcroot)src/tsd.c \
$(srcroot)src/util.c \
$(srcroot)src/witness.c
ifeq ($(enable_zone_allocator), 1)
C_SRCS += $(srcroot)src/zone.c
@ -201,14 +206,21 @@ TESTS_UNIT := \
$(srcroot)test/unit/atomic.c \
$(srcroot)test/unit/background_thread.c \
$(srcroot)test/unit/background_thread_enable.c \
$(srcroot)test/unit/background_thread_init.c \
$(srcroot)test/unit/base.c \
$(srcroot)test/unit/batch_alloc.c \
$(srcroot)test/unit/bin.c \
$(srcroot)test/unit/binshard.c \
$(srcroot)test/unit/bitmap.c \
$(srcroot)test/unit/bit_util.c \
$(srcroot)test/unit/buf_writer.c \
$(srcroot)test/unit/cache_bin.c \
$(srcroot)test/unit/ckh.c \
$(srcroot)test/unit/conf.c \
$(srcroot)test/unit/conf_init_0.c \
$(srcroot)test/unit/conf_init_1.c \
$(srcroot)test/unit/conf_init_confirm.c \
$(srcroot)test/unit/conf_parse.c \
$(srcroot)test/unit/counter.c \
$(srcroot)test/unit/decay.c \
$(srcroot)test/unit/div.c \
@ -224,6 +236,10 @@ TESTS_UNIT := \
$(srcroot)test/unit/hash.c \
$(srcroot)test/unit/hook.c \
$(srcroot)test/unit/hpa.c \
$(srcroot)test/unit/hpa_sec_integration.c \
$(srcroot)test/unit/hpa_thp_always.c \
$(srcroot)test/unit/hpa_vectorized_madvise.c \
$(srcroot)test/unit/hpa_vectorized_madvise_large_batch.c \
$(srcroot)test/unit/hpa_background_thread.c \
$(srcroot)test/unit/hpdata.c \
$(srcroot)test/unit/huge.c \
@ -231,6 +247,8 @@ TESTS_UNIT := \
$(srcroot)test/unit/junk.c \
$(srcroot)test/unit/junk_alloc.c \
$(srcroot)test/unit/junk_free.c \
$(srcroot)test/unit/json_stats.c \
$(srcroot)test/unit/large_ralloc.c \
$(srcroot)test/unit/log.c \
$(srcroot)test/unit/mallctl.c \
$(srcroot)test/unit/malloc_conf_2.c \
@ -240,6 +258,7 @@ TESTS_UNIT := \
$(srcroot)test/unit/mq.c \
$(srcroot)test/unit/mtx.c \
$(srcroot)test/unit/nstime.c \
$(srcroot)test/unit/ncached_max.c \
$(srcroot)test/unit/oversize_threshold.c \
$(srcroot)test/unit/pa.c \
$(srcroot)test/unit/pack.c \
@ -256,6 +275,7 @@ TESTS_UNIT := \
$(srcroot)test/unit/prof_mdump.c \
$(srcroot)test/unit/prof_recent.c \
$(srcroot)test/unit/prof_reset.c \
$(srcroot)test/unit/prof_small.c \
$(srcroot)test/unit/prof_stats.c \
$(srcroot)test/unit/prof_tctx.c \
$(srcroot)test/unit/prof_thread_name.c \
@ -279,6 +299,7 @@ TESTS_UNIT := \
$(srcroot)test/unit/stats.c \
$(srcroot)test/unit/stats_print.c \
$(srcroot)test/unit/sz.c \
$(srcroot)test/unit/tcache_init.c \
$(srcroot)test/unit/tcache_max.c \
$(srcroot)test/unit/test_hooks.c \
$(srcroot)test/unit/thread_event.c \
@ -332,10 +353,15 @@ TESTS_STRESS := $(srcroot)test/stress/batch_alloc.c \
$(srcroot)test/stress/large_microbench.c \
$(srcroot)test/stress/mallctl.c \
$(srcroot)test/stress/microbench.c
ifeq (@enable_cxx@, 1)
TESTS_STRESS_CPP := $(srcroot)test/stress/cpp/microbench.cpp
else
TESTS_STRESS_CPP :=
endif
TESTS := $(TESTS_UNIT) $(TESTS_INTEGRATION) $(TESTS_INTEGRATION_CPP) \
$(TESTS_ANALYZE) $(TESTS_STRESS)
$(TESTS_ANALYZE) $(TESTS_STRESS) $(TESTS_STRESS_CPP)
PRIVATE_NAMESPACE_HDRS := $(objroot)include/jemalloc/internal/private_namespace.h $(objroot)include/jemalloc/internal/private_namespace_jet.h
PRIVATE_NAMESPACE_GEN_HDRS := $(PRIVATE_NAMESPACE_HDRS:%.h=%.gen.h)
@ -362,9 +388,10 @@ TESTS_INTEGRATION_OBJS := $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%.$(O))
TESTS_INTEGRATION_CPP_OBJS := $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%.$(O))
TESTS_ANALYZE_OBJS := $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%.$(O))
TESTS_STRESS_OBJS := $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%.$(O))
TESTS_STRESS_CPP_OBJS := $(TESTS_STRESS_CPP:$(srcroot)%.cpp=$(objroot)%.$(O))
TESTS_OBJS := $(TESTS_UNIT_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_ANALYZE_OBJS) \
$(TESTS_STRESS_OBJS)
TESTS_CPP_OBJS := $(TESTS_INTEGRATION_CPP_OBJS)
TESTS_CPP_OBJS := $(TESTS_INTEGRATION_CPP_OBJS) $(TESTS_STRESS_CPP_OBJS)
.PHONY: all dist build_doc_html build_doc_man build_doc
.PHONY: install_bin install_include install_lib
@ -454,10 +481,13 @@ $(TESTS_INTEGRATION_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_TEST
$(TESTS_INTEGRATION_CPP_OBJS): CPPFLAGS += -DJEMALLOC_INTEGRATION_CPP_TEST
$(TESTS_ANALYZE_OBJS): CPPFLAGS += -DJEMALLOC_ANALYZE_TEST
$(TESTS_STRESS_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_TEST
$(TESTS_STRESS_CPP_OBJS): CPPFLAGS += -DJEMALLOC_STRESS_CPP_TEST
$(TESTS_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.c
$(TESTS_CPP_OBJS): $(objroot)test/%.$(O): $(srcroot)test/%.cpp
$(TESTS_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include
$(TESTS_CPP_OBJS): CPPFLAGS += -I$(srcroot)test/include -I$(objroot)test/include
$(TESTS_OBJS): CFLAGS += -fno-builtin
$(TESTS_CPP_OBJS): CPPFLAGS += -fno-builtin
ifneq ($(IMPORTLIB),$(SO))
$(CPP_OBJS) $(C_SYM_OBJS) $(C_OBJS) $(C_JET_SYM_OBJS) $(C_JET_OBJS): CPPFLAGS += -DDLLEXPORT
endif
@ -472,7 +502,7 @@ $(TESTS_OBJS) $(TESTS_CPP_OBJS): $(objroot)test/include/test/jemalloc_test.h
endif
$(C_OBJS) $(CPP_OBJS) $(C_PIC_OBJS) $(CPP_PIC_OBJS) $(C_TESTLIB_INTEGRATION_OBJS) $(C_UTIL_INTEGRATION_OBJS) $(TESTS_INTEGRATION_OBJS) $(TESTS_INTEGRATION_CPP_OBJS): $(objroot)include/jemalloc/internal/private_namespace.h
$(C_JET_OBJS) $(C_TESTLIB_UNIT_OBJS) $(C_TESTLIB_ANALYZE_OBJS) $(C_TESTLIB_STRESS_OBJS) $(TESTS_UNIT_OBJS) $(TESTS_ANALYZE_OBJS) $(TESTS_STRESS_OBJS): $(objroot)include/jemalloc/internal/private_namespace_jet.h
$(C_JET_OBJS) $(C_TESTLIB_UNIT_OBJS) $(C_TESTLIB_ANALYZE_OBJS) $(C_TESTLIB_STRESS_OBJS) $(TESTS_UNIT_OBJS) $(TESTS_ANALYZE_OBJS) $(TESTS_STRESS_OBJS) $(TESTS_STRESS_CPP_OBJS): $(objroot)include/jemalloc/internal/private_namespace_jet.h
$(C_SYM_OBJS) $(C_OBJS) $(C_PIC_OBJS) $(C_JET_SYM_OBJS) $(C_JET_OBJS) $(C_TESTLIB_OBJS) $(TESTS_OBJS): %.$(O):
@mkdir -p $(@D)
@ -513,7 +543,11 @@ endif
$(objroot)lib/$(LIBJEMALLOC).$(SOREV) : $(if $(PIC_CFLAGS),$(C_PIC_OBJS),$(C_OBJS)) $(if $(PIC_CFLAGS),$(CPP_PIC_OBJS),$(CPP_OBJS))
@mkdir -p $(@D)
ifeq (@enable_cxx@, 1)
$(CXX) $(DSO_LDFLAGS) $(call RPATH,$(RPATH_EXTRA)) $(LDTARGET) $+ $(LDFLAGS) $(LIBS) $(EXTRA_LDFLAGS)
else
$(CC) $(DSO_LDFLAGS) $(call RPATH,$(RPATH_EXTRA)) $(LDTARGET) $+ $(LDFLAGS) $(LIBS) $(EXTRA_LDFLAGS)
endif
$(objroot)lib/$(LIBJEMALLOC)_pic.$(A) : $(C_PIC_OBJS) $(CPP_PIC_OBJS)
$(objroot)lib/$(LIBJEMALLOC).$(A) : $(C_OBJS) $(CPP_OBJS)
@ -543,6 +577,28 @@ $(objroot)test/stress/%$(EXE): $(objroot)test/stress/%.$(O) $(C_JET_OBJS) $(C_TE
@mkdir -p $(@D)
$(CC) $(TEST_LD_MODE) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(objroot)lib/$(LIBJEMALLOC).$(IMPORTLIB) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)
$(objroot)test/stress/pa/pa_data_preprocessor$(EXE): $(objroot)test/stress/pa/pa_data_preprocessor.$(O)
@mkdir -p $(@D)
$(CXX) $(LDTARGET) $(filter %.$(O),$^) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)
$(objroot)test/stress/pa/pa_microbench$(EXE): $(objroot)test/stress/pa/pa_microbench.$(O) $(C_JET_OBJS) $(C_TESTLIB_STRESS_OBJS)
@mkdir -p $(@D)
$(CC) $(LDTARGET) $(filter %.$(O),$^) $(call RPATH,$(objroot)lib) $(LDFLAGS) $(filter-out -lm,$(LIBS)) $(LM) $(EXTRA_LDFLAGS)
$(objroot)test/stress/pa/%.$(O): $(srcroot)test/stress/pa/%.c
@mkdir -p $(@D)
$(CC) $(CFLAGS) -c $(CPPFLAGS) -DJEMALLOC_STRESS_TEST -I$(srcroot)test/include -I$(objroot)test/include $(CTARGET) $<
ifdef CC_MM
@$(CC) -MM $(CPPFLAGS) -DJEMALLOC_STRESS_TEST -I$(srcroot)test/include -I$(objroot)test/include -MT $@ -o $(@:%.$(O)=%.d) $<
endif
$(objroot)test/stress/pa/%.$(O): $(srcroot)test/stress/pa/%.cpp
@mkdir -p $(@D)
$(CXX) $(CXXFLAGS) -c $(CPPFLAGS) -I$(srcroot)test/include -I$(objroot)test/include $(CTARGET) $<
ifdef CC_MM
@$(CXX) -MM $(CPPFLAGS) -I$(srcroot)test/include -I$(objroot)test/include -MT $@ -o $(@:%.$(O)=%.d) $<
endif
build_lib_shared: $(DSOS)
build_lib_static: $(STATIC_LIBS)
ifeq ($(enable_shared), 1)
@ -555,18 +611,20 @@ endif
install_bin:
$(INSTALL) -d $(BINDIR)
@for b in $(BINS); do \
$(INSTALL) -v -m 755 $$b $(BINDIR); \
echo "$(INSTALL) -m 755 $$b $(BINDIR)"; \
$(INSTALL) -m 755 $$b $(BINDIR); \
done
install_include:
$(INSTALL) -d $(INCLUDEDIR)/jemalloc
@for h in $(C_HDRS); do \
$(INSTALL) -v -m 644 $$h $(INCLUDEDIR)/jemalloc; \
echo "$(INSTALL) -m 644 $$h $(INCLUDEDIR)/jemalloc"; \
$(INSTALL) -m 644 $$h $(INCLUDEDIR)/jemalloc; \
done
install_lib_shared: $(DSOS)
$(INSTALL) -d $(LIBDIR)
$(INSTALL) -v -m 755 $(objroot)lib/$(LIBJEMALLOC).$(SOREV) $(LIBDIR)
$(INSTALL) -m 755 $(objroot)lib/$(LIBJEMALLOC).$(SOREV) $(LIBDIR)
ifneq ($(SOREV),$(SO))
ln -sf $(LIBJEMALLOC).$(SOREV) $(LIBDIR)/$(LIBJEMALLOC).$(SO)
endif
@ -574,13 +632,15 @@ endif
install_lib_static: $(STATIC_LIBS)
$(INSTALL) -d $(LIBDIR)
@for l in $(STATIC_LIBS); do \
$(INSTALL) -v -m 755 $$l $(LIBDIR); \
echo "$(INSTALL) -m 755 $$l $(LIBDIR)"; \
$(INSTALL) -m 755 $$l $(LIBDIR); \
done
install_lib_pc: $(PC)
$(INSTALL) -d $(LIBDIR)/pkgconfig
@for l in $(PC); do \
$(INSTALL) -v -m 644 $$l $(LIBDIR)/pkgconfig; \
echo "$(INSTALL) -m 644 $$l $(LIBDIR)/pkgconfig"; \
$(INSTALL) -m 644 $$l $(LIBDIR)/pkgconfig; \
done
ifeq ($(enable_shared), 1)
@ -594,13 +654,15 @@ install_lib: install_lib_pc
install_doc_html: build_doc_html
$(INSTALL) -d $(DATADIR)/doc/jemalloc$(install_suffix)
@for d in $(DOCS_HTML); do \
$(INSTALL) -v -m 644 $$d $(DATADIR)/doc/jemalloc$(install_suffix); \
echo "$(INSTALL) -m 644 $$d $(DATADIR)/doc/jemalloc$(install_suffix)"; \
$(INSTALL) -m 644 $$d $(DATADIR)/doc/jemalloc$(install_suffix); \
done
install_doc_man: build_doc_man
$(INSTALL) -d $(MANDIR)/man3
@for d in $(DOCS_MAN3); do \
$(INSTALL) -v -m 644 $$d $(MANDIR)/man3; \
echo "$(INSTALL) -m 644 $$d $(MANDIR)/man3"; \
$(INSTALL) -m 644 $$d $(MANDIR)/man3; \
done
install_doc: install_doc_html install_doc_man
@ -656,7 +718,8 @@ endif
tests_unit: $(TESTS_UNIT:$(srcroot)%.c=$(objroot)%$(EXE))
tests_integration: $(TESTS_INTEGRATION:$(srcroot)%.c=$(objroot)%$(EXE)) $(TESTS_INTEGRATION_CPP:$(srcroot)%.cpp=$(objroot)%$(EXE))
tests_analyze: $(TESTS_ANALYZE:$(srcroot)%.c=$(objroot)%$(EXE))
tests_stress: $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%$(EXE))
tests_stress: $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%$(EXE)) $(TESTS_STRESS_CPP:$(srcroot)%.cpp=$(objroot)%$(EXE))
tests_pa: $(objroot)test/stress/pa/pa_data_preprocessor$(EXE) $(objroot)test/stress/pa/pa_microbench$(EXE)
tests: tests_unit tests_integration tests_analyze tests_stress
check_unit_dir:
@ -689,6 +752,7 @@ else
endif
stress: tests_stress stress_dir
$(SHELL) $(objroot)test/test.sh $(TESTS_STRESS:$(srcroot)%.c=$(objroot)%)
$(SHELL) $(objroot)test/test.sh $(TESTS_STRESS_CPP:$(srcroot)%.cpp=$(objroot)%)
check: check_unit check_integration check_integration_decay check_integration_prof
clean:

README

@ -17,4 +17,4 @@ jemalloc.
The ChangeLog file contains a brief summary of changes for each release.
URL: http://jemalloc.net/
URL: https://jemalloc.net/


@ -11,9 +11,9 @@ by a few percent, or make favorable trade-offs.
## Notable runtime options for performance tuning
Runtime options can be set via
[malloc_conf](http://jemalloc.net/jemalloc.3.html#tuning).
[malloc_conf](https://jemalloc.net/jemalloc.3.html#tuning).
* [background_thread](http://jemalloc.net/jemalloc.3.html#background_thread)
* [background_thread](https://jemalloc.net/jemalloc.3.html#background_thread)
Enabling jemalloc background threads generally improves the tail latency for
application threads, since unused memory purging is shifted to the dedicated
@ -23,7 +23,7 @@ Runtime options can be set via
Suggested: `background_thread:true` when jemalloc managed threads can be
allowed.
* [metadata_thp](http://jemalloc.net/jemalloc.3.html#opt.metadata_thp)
* [metadata_thp](https://jemalloc.net/jemalloc.3.html#opt.metadata_thp)
Allowing jemalloc to utilize transparent huge pages for its internal
metadata usually reduces TLB misses significantly, especially for programs
@ -35,8 +35,8 @@ Runtime options can be set via
`metadata_thp:always`, which is expected to improve CPU utilization at a
small memory cost.
* [dirty_decay_ms](http://jemalloc.net/jemalloc.3.html#opt.dirty_decay_ms) and
[muzzy_decay_ms](http://jemalloc.net/jemalloc.3.html#opt.muzzy_decay_ms)
* [dirty_decay_ms](https://jemalloc.net/jemalloc.3.html#opt.dirty_decay_ms) and
[muzzy_decay_ms](https://jemalloc.net/jemalloc.3.html#opt.muzzy_decay_ms)
Decay time determines how fast jemalloc returns unused pages back to the
operating system, and therefore provides a fairly straightforward trade-off
@ -46,7 +46,7 @@ Runtime options can be set via
Suggested: tune the values based on the desired trade-offs.
* [narenas](http://jemalloc.net/jemalloc.3.html#opt.narenas)
* [narenas](https://jemalloc.net/jemalloc.3.html#opt.narenas)
By default jemalloc uses multiple arenas to reduce internal lock contention.
However, a high arena count may also increase overall memory fragmentation,
@ -57,7 +57,7 @@ Runtime options can be set via
Suggested: if low parallelism is expected, try lower arena count while
monitoring CPU and memory usage.
* [percpu_arena](http://jemalloc.net/jemalloc.3.html#opt.percpu_arena)
* [percpu_arena](https://jemalloc.net/jemalloc.3.html#opt.percpu_arena)
Enable dynamic thread-to-arena association based on running CPU. This has
the potential to improve locality, e.g. when thread to CPU affinity is
@ -100,28 +100,28 @@ aborts immediately on illegal options.
In addition to the runtime options, there are a number of programmatic ways to
improve application performance with jemalloc.
* [Explicit arenas](http://jemalloc.net/jemalloc.3.html#arenas.create)
* [Explicit arenas](https://jemalloc.net/jemalloc.3.html#arenas.create)
Manually created arenas can help performance in various ways, e.g. by
managing locality and contention for specific usages. For example,
applications can explicitly allocate frequently accessed objects from a
dedicated arena with
[mallocx()](http://jemalloc.net/jemalloc.3.html#MALLOCX_ARENA) to improve
[mallocx()](https://jemalloc.net/jemalloc.3.html#MALLOCX_ARENA) to improve
locality. In addition, explicit arenas often benefit from individually
tuned options, e.g. relaxed [decay
time](http://jemalloc.net/jemalloc.3.html#arena.i.dirty_decay_ms) if
time](https://jemalloc.net/jemalloc.3.html#arena.i.dirty_decay_ms) if
frequent reuse is expected.
* [Extent hooks](http://jemalloc.net/jemalloc.3.html#arena.i.extent_hooks)
* [Extent hooks](https://jemalloc.net/jemalloc.3.html#arena.i.extent_hooks)
Extent hooks allow customization for managing underlying memory. One use
case for performance purposes is to utilize huge pages -- for example,
[HHVM](https://github.com/facebook/hhvm/blob/master/hphp/util/alloc.cpp)
[HHVM](https://github.com/facebook/hhvm/blob/master/hphp/util/alloc.cpp)
uses explicit arenas with customized extent hooks to manage 1GB huge pages
for frequently accessed data, which reduces TLB misses significantly.
* [Explicit thread-to-arena
binding](http://jemalloc.net/jemalloc.3.html#thread.arena)
binding](https://jemalloc.net/jemalloc.3.html#thread.arena)
It is common for some threads in an application to have different memory
access / allocation patterns. Threads with heavy workloads often benefit


@ -9,8 +9,8 @@ for i in autoconf; do
fi
done
echo "./configure --enable-autogen $@"
./configure --enable-autogen $@
echo "./configure --enable-autogen \"$@\""
./configure --enable-autogen "$@"
if [ $? -ne 0 ]; then
echo "Error $? in ./configure"
exit 1


@ -88,6 +88,7 @@ my %obj_tool_map = (
#"nm_pdb" => "nm-pdb", # for reading windows (PDB-format) executables
#"addr2line_pdb" => "addr2line-pdb", # ditto
#"otool" => "otool", # equivalent of objdump on OS X
#"dyld_info" => "dyld_info", # equivalent of otool on OS X for shared cache
);
# NOTE: these are lists, so you can put in commandline flags if you want.
my @DOT = ("dot"); # leave non-absolute, since it may be in /usr/local
@ -688,15 +689,15 @@ sub Main() {
my $symbol_map = {};
# Read one profile, pick the last item on the list
my $data = ReadProfile($main::prog, pop(@main::profile_files));
my $data = ReadProfile($main::prog, $main::profile_files[0]);
my $profile = $data->{profile};
my $pcs = $data->{pcs};
my $libs = $data->{libs}; # Info about main program and shared libraries
$symbol_map = MergeSymbols($symbol_map, $data->{symbols});
# Add additional profiles, if available.
if (scalar(@main::profile_files) > 0) {
foreach my $pname (@main::profile_files) {
if (scalar(@main::profile_files) > 1) {
foreach my $pname (@main::profile_files[1..$#main::profile_files]) {
my $data2 = ReadProfile($main::prog, $pname);
$profile = AddProfile($profile, $data2->{profile});
$pcs = AddPcs($pcs, $data2->{pcs});
@ -2955,8 +2956,25 @@ sub RemoveUninterestingFrames {
foreach my $name ('@JEMALLOC_PREFIX@calloc',
'cfree',
'@JEMALLOC_PREFIX@malloc',
'je_malloc_default',
'newImpl',
'void* newImpl',
'fallbackNewImpl',
'void* fallbackNewImpl',
'fallback_impl',
'void* fallback_impl',
'imalloc',
'int imalloc',
'imalloc_body',
'int imalloc_body',
'prof_alloc_prep',
'prof_tctx_t *prof_alloc_prep',
'prof_backtrace_impl',
'void prof_backtrace_impl',
'je_prof_backtrace',
'void je_prof_backtrace',
'je_prof_tctx_create',
'prof_tctx_t* prof_tctx_create',
'@JEMALLOC_PREFIX@free',
'@JEMALLOC_PREFIX@memalign',
'@JEMALLOC_PREFIX@posix_memalign',
@ -2965,7 +2983,12 @@ sub RemoveUninterestingFrames {
'@JEMALLOC_PREFIX@valloc',
'@JEMALLOC_PREFIX@realloc',
'@JEMALLOC_PREFIX@mallocx',
'irallocx_prof',
'void *irallocx_prof',
'@JEMALLOC_PREFIX@rallocx',
'do_rallocx',
'ixallocx_prof',
'size_t ixallocx_prof',
'@JEMALLOC_PREFIX@xallocx',
'@JEMALLOC_PREFIX@dallocx',
'@JEMALLOC_PREFIX@sdallocx',
@ -3078,6 +3101,8 @@ sub RemoveUninterestingFrames {
foreach my $a (@addrs) {
if (exists($symbols->{$a})) {
my $func = $symbols->{$a}->[0];
# Remove the suffix following a space in the symbol when filtering.
$func =~ s/ .*//;
if ($skip{$func} || ($func =~ m/$skip_regexp/)) {
# Throw away the portion of the backtrace seen so far, under the
# assumption that previous frames were for functions internal to the
@ -4500,19 +4525,19 @@ sub FindLibrary {
# For libc libraries, the copy in /usr/lib/debug contains debugging symbols
sub DebuggingLibrary {
my $file = shift;
if ($file !~ m|^/|) {
return undef;
}
# Find debug symbol file if it's named after the library's name.
if (-f "/usr/lib/debug$file") {
if (-f "/usr/lib/debug$file") {
if($main::opt_debug) { print STDERR "found debug info for $file in /usr/lib/debug$file\n"; }
return "/usr/lib/debug$file";
} elsif (-f "/usr/lib/debug$file.debug") {
if($main::opt_debug) { print STDERR "found debug info for $file in /usr/lib/debug$file.debug\n"; }
return "/usr/lib/debug$file.debug";
return "/usr/lib/debug$file.debug";
}
if(!$main::opt_debug_syms_by_id) {
@ -4521,7 +4546,7 @@ sub DebuggingLibrary {
}
# Find debug file if it's named after the library's build ID.
my $readelf = '';
if (!$main::gave_up_on_elfutils) {
$readelf = qx/eu-readelf -n ${file}/;
@ -4657,7 +4682,65 @@ sub ParseTextSectionHeaderFromOtool {
return $r;
}
# Parse text section header of a library in OS X shared cache using dyld_info
sub ParseTextSectionHeaderFromDyldInfo {
my $lib = shift;
my $size = undef;
my $vma;
my $file_offset;
# Get dyld_info output from the library file to figure out how to
# map between mapped addresses and addresses in the library.
my $cmd = ShellEscape($obj_tool_map{"dyld_info"}, "-segments", $lib);
open(DYLD, "$cmd |") || error("$cmd: $!\n");
while (<DYLD>) {
s/\r//g; # turn windows-looking lines into unix-looking lines
# -segments:
# load-address segment section sect-size seg-size perm
# 0x1803E0000 __TEXT 112KB r.x
# 0x1803E4F34 __text 80960
# 0x1803F8B74 __auth_stubs 768
# 0x1803F8E74 __init_offsets 4
# 0x1803F8E78 __gcc_except_tab 1180
my @x = split;
if ($#x >= 2) {
if ($x[0] eq 'load-offset') {
# dyld_info should only be used for the shared lib.
return undef;
} elsif ($x[1] eq '__TEXT') {
$file_offset = $x[0];
} elsif ($x[1] eq '__text') {
$size = $x[2];
$vma = $x[0];
$file_offset = AddressSub($x[0], $file_offset);
last;
}
}
}
close(DYLD);
if (!defined($vma) || !defined($size) || !defined($file_offset)) {
return undef;
}
my $r = {};
$r->{size} = $size;
$r->{vma} = $vma;
$r->{file_offset} = $file_offset;
return $r;
}
sub ParseTextSectionHeader {
# obj_tool_map("dyld_info") is only defined if we're in a Mach-O environment
if (defined($obj_tool_map{"dyld_info"})) {
my $r = ParseTextSectionHeaderFromDyldInfo(@_);
if (defined($r)){
return $r;
}
}
# if dyld_info doesn't work, or we don't have it, fall back to otool
# obj_tool_map("otool") is only defined if we're in a Mach-O environment
if (defined($obj_tool_map{"otool"})) {
my $r = ParseTextSectionHeaderFromOtool(@_);
@ -4698,7 +4781,7 @@ sub ParseLibraries {
$offset = HexExtend($3);
$lib = $4;
$lib =~ s|\\|/|g; # turn windows-style paths into unix-style paths
} elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.so(\.\d+)*)/) {
} elsif ($l =~ /^\s*($h)-($h):\s*(\S+\.(so|dll|dylib|bundle)(\.\d+)*)/) {
# Cooked line from DumpAddressMap. Example:
# 40000000-40015000: /lib/ld-2.3.2.so
$start = HexExtend($1);
@ -4715,6 +4798,15 @@ sub ParseLibraries {
$offset = HexExtend($3);
$lib = $4;
$lib =~ s|\\|/|g; # turn windows-style paths into unix-style paths
} elsif (($l =~ /^\s*($h)-($h):\s*(\S+)/) && ($3 eq $prog)) {
# PIEs and address space randomization do not play well with our
# default assumption that the main executable is at the lowest
# addresses, so we detect the main executable from
# DumpAddressMap as well.
$start = HexExtend($1);
$finish = HexExtend($2);
$offset = $zero_offset;
$lib = $3;
}
# FreeBSD 10.0 virtual memory map /proc/curproc/map as defined in
# function procfs_doprocmap (sys/fs/procfs/procfs_map.c)
@ -5245,6 +5337,7 @@ sub ConfigureObjTools {
if ($file_type =~ /Mach-O/) {
# OS X uses otool to examine Mach-O files, rather than objdump.
$obj_tool_map{"otool"} = "otool";
$obj_tool_map{"dyld_info"} = "dyld_info";
$obj_tool_map{"addr2line"} = "false"; # no addr2line
$obj_tool_map{"objdump"} = "false"; # no objdump
}

build-aux/config.guess (vendored; diff suppressed because it is too large)
build-aux/config.sub (vendored; diff suppressed because it is too large)

@ -115,7 +115,7 @@ fi
if [ x"$dir_arg" != x ]; then
dst=$src
src=""
if [ -d $dst ]; then
instcmd=:
else
@ -124,7 +124,7 @@ if [ x"$dir_arg" != x ]; then
else
# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
# might cause directories to be created, which would be especially bad
# might cause directories to be created, which would be especially bad
# if $src (and thus $dsttmp) contains '*'.
if [ -f $src -o -d $src ]
@ -134,7 +134,7 @@ else
echo "install: $src does not exist"
exit 1
fi
if [ x"$dst" = x ]
then
echo "install: no destination specified"
@ -201,17 +201,17 @@ else
# If we're going to rename the final executable, determine the name now.
if [ x"$transformarg" = x ]
if [ x"$transformarg" = x ]
then
dstfile=`basename $dst`
else
dstfile=`basename $dst $transformbasename |
dstfile=`basename $dst $transformbasename |
sed $transformarg`$transformbasename
fi
# don't allow the sed command to completely eliminate the filename
if [ x"$dstfile" = x ]
if [ x"$dstfile" = x ]
then
dstfile=`basename $dst`
else
@ -242,7 +242,7 @@ else
# Now rename the file to the real destination.
$doit $rmcmd -f $dstdir/$dstfile &&
$doit $mvcmd $dsttmp $dstdir/$dstfile
$doit $mvcmd $dsttmp $dstdir/$dstfile
fi &&


@ -92,6 +92,32 @@ AC_LANG_POP([C++])
JE_CONCAT_VVV(CXXFLAGS, CONFIGURE_CXXFLAGS, SPECIFIED_CXXFLAGS)
])
CONFIGURE_LDFLAGS=
SPECIFIED_LDFLAGS="${LDFLAGS}"
dnl JE_LDFLAGS_ADD(ldflag)
dnl
dnl LDFLAGS is the concatenation of CONFIGURE_LDFLAGS and SPECIFIED_LDFLAGS
dnl This macro appends to CONFIGURE_LDFLAGS and regenerates LDFLAGS.
AC_DEFUN([JE_LDFLAGS_ADD],
[
AC_MSG_CHECKING([whether linker supports $1])
T_CONFIGURE_LDFLAGS="${CONFIGURE_LDFLAGS}"
JE_APPEND_VS(CONFIGURE_LDFLAGS, $1)
JE_CONCAT_VVV(LDFLAGS, CONFIGURE_LDFLAGS, SPECIFIED_LDFLAGS)
AC_LINK_IFELSE([AC_LANG_PROGRAM(
[[
]], [[
return 0;
]])],
[je_cv_ldflags_added=$1]
AC_MSG_RESULT([yes]),
[je_cv_ldflags_added=]
AC_MSG_RESULT([no])
[CONFIGURE_LDFLAGS="${T_CONFIGURE_LDFLAGS}"]
)
JE_CONCAT_VVV(LDFLAGS, CONFIGURE_LDFLAGS, SPECIFIED_LDFLAGS)
])
dnl JE_COMPILABLE(label, hcode, mcode, rvar)
dnl
dnl Use AC_LINK_IFELSE() rather than AC_COMPILE_IFELSE() so that linker errors
@ -298,6 +324,15 @@ fi
,
enable_cxx="1"
)
AC_ARG_WITH([cxx_stdlib],
[AS_HELP_STRING([--with-cxx-stdlib=<libstdc++|libcxx>],
[Specify the C++ standard library to link (default: probe for libstdc++)])],
[case "${with_cxx_stdlib}" in
libstdc++|libcxx) ;;
*) AC_MSG_ERROR([bad value ${with_cxx_stdlib} for --with-cxx-stdlib]) ;;
esac],
[with_cxx_stdlib=""]
)
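The `--with-cxx-stdlib` option added above can be exercised at configure time. A minimal sketch, assuming a source tree with this version of `configure`; the invocations are illustrative:

```shell
# Link jemalloc's C++ support against libc++ instead of probing for libstdc++.
./configure --enable-cxx --with-cxx-stdlib=libcxx

# Explicitly request the default probe target.
./configure --enable-cxx --with-cxx-stdlib=libstdc++

# Any other value is rejected by the case statement with
# "bad value ... for --with-cxx-stdlib".
```

When the option is omitted, the probe path below (`JE_COMPILABLE([libstdc++ linkage], ...)`) decides whether `-lstdc++` is usable.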
if test "x$enable_cxx" = "x1" ; then
dnl Require at least c++14, which is the first version to support sized
dnl deallocation. C++ support is not compiled otherwise.
@ -312,17 +347,28 @@ if test "x$enable_cxx" = "x1" ; then
JE_CXXFLAGS_ADD([-g3])
SAVED_LIBS="${LIBS}"
JE_APPEND_VS(LIBS, -lstdc++)
JE_COMPILABLE([libstdc++ linkage], [
case "${with_cxx_stdlib}" in
libstdc++)
JE_APPEND_VS(LIBS, -lstdc++)
;;
libcxx)
JE_APPEND_VS(LIBS, -lc++)
;;
*)
dnl Probe for libstdc++ (the default when --with-cxx-stdlib is not given).
JE_APPEND_VS(LIBS, -lstdc++)
JE_COMPILABLE([libstdc++ linkage], [
#include <stdlib.h>
], [[
int *arr = (int *)malloc(sizeof(int) * 42);
if (arr == NULL)
return 1;
]], [je_cv_libstdcxx])
if test "x${je_cv_libstdcxx}" = "xno" ; then
LIBS="${SAVED_LIBS}"
fi
if test "x${je_cv_libstdcxx}" = "xno" ; then
LIBS="${SAVED_LIBS}"
fi
;;
esac
else
enable_cxx="0"
fi
@ -510,6 +556,23 @@ typedef unsigned __int32 uint32_t;
else
AC_MSG_ERROR([cannot determine number of significant virtual address bits])
fi
AC_CACHE_CHECK([rdtscp support],
[je_cv_rdtscp],
AC_RUN_IFELSE([AC_LANG_PROGRAM(
[[
#include <stdint.h>
]],
[[
unsigned int dx;
asm volatile("rdtscp" : "=d"(dx) ::);
return 0;
]])],
[je_cv_rdtscp=yes],
[je_cv_rdtscp=no],
[je_cv_rdtscp=no]))
if test "x${je_cv_rdtscp}" = "xyes"; then
AC_DEFINE([JEMALLOC_HAVE_RDTSCP], [ ], [ ])
fi
fi
;;
*)
@ -529,6 +592,37 @@ typedef unsigned __int32 uint32_t;
;;
esac
AC_DEFINE_UNQUOTED([LG_VADDR], [$LG_VADDR], [ ])
AC_CACHE_CHECK([asm volatile support],
[je_cv_asm_volatile],
AC_RUN_IFELSE([AC_LANG_PROGRAM(
[[
]],
[[
void* ptr;
asm volatile("" : "+r"(ptr));
return 0;
]])],
[je_cv_asm_volatile=yes],
[je_cv_asm_volatile=no],
[je_cv_asm_volatile=no]))
if test "x${je_cv_asm_volatile}" = "xyes"; then
AC_DEFINE([JEMALLOC_HAVE_ASM_VOLATILE], [ ], [ ])
fi
AC_CACHE_CHECK([__int128 support],
[je_cv_int128],
AC_RUN_IFELSE([AC_LANG_PROGRAM(
[[
]],
[[
__int128 temp = 0;
return temp;
]])],
[je_cv_int128=yes],
[je_cv_int128=no],
[je_cv_int128=no]))
if test "x${je_cv_int128}" = "xyes"; then
AC_DEFINE([JEMALLOC_HAVE_INT128], [ ], [ ])
fi
LD_PRELOAD_VAR="LD_PRELOAD"
so="so"
@ -578,7 +672,7 @@ AC_ARG_WITH([version],
[AS_HELP_STRING([--with-version=<major>.<minor>.<bugfix>-<nrev>-g<gid>],
[Version string])],
[
echo "${with_version}" | grep ['^[0-9]\+\.[0-9]\+\.[0-9]\+-[0-9]\+-g[0-9a-f]\+$'] 2>&1 1>/dev/null
echo "${with_version}" | grep ['^[0-9]\{1,\}\.[0-9]\{1,\}\.[0-9]\{1,\}-[0-9]\{1,\}-g[0-9a-f]\{1,\}$'] 2>&1 1>/dev/null
if test $? -eq 0 ; then
echo "$with_version" > "${objroot}VERSION"
else
@ -654,6 +748,9 @@ case "${host}" in
SOREV="${rev}.${so}"
sbrk_deprecated="1"
SYM_PREFIX="_"
if test "${LG_SIZEOF_PTR}" = "3"; then
default_retain="1"
fi
;;
*-*-freebsd*)
JE_APPEND_VS(CPPFLAGS, -D_BSD_SOURCE)
@ -687,6 +784,19 @@ case "${host}" in
fi
zero_realloc_default_free="1"
;;
*-*-linux-musl*)
dnl syscall(2) and secure_getenv(3) are exposed by _GNU_SOURCE.
JE_APPEND_VS(CPPFLAGS, -D_GNU_SOURCE)
abi="elf"
AC_DEFINE([JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS], [ ], [ ])
AC_DEFINE([JEMALLOC_HAS_ALLOCA_H], [ ], [ ])
AC_DEFINE([JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY], [ ], [ ])
AC_DEFINE([JEMALLOC_THREADED_INIT], [ ], [ ])
if test "${LG_SIZEOF_PTR}" = "3"; then
default_retain="1"
fi
zero_realloc_default_free="1"
;;
*-*-linux*)
dnl syscall(2) and secure_getenv(3) are exposed by _GNU_SOURCE.
JE_APPEND_VS(CPPFLAGS, -D_GNU_SOURCE)
@ -829,11 +939,26 @@ AC_SUBST([DUMP_SYMS])
AC_SUBST([CC_MM])
dnl Determine whether libm must be linked to use e.g. log(3).
AC_SEARCH_LIBS([log], [m], , [AC_MSG_ERROR([Missing math functions])])
if test "x$ac_cv_search_log" != "xnone required" ; then
LM="$ac_cv_search_log"
else
# On MSVC, log is an intrinsic that doesn't require libm. However,
# AC_SEARCH_LIBS does not successfully detect this, as it will try to compile
# a program using the wrong signature for log. Newer versions of MSVC CL detects
# this and rejects the program with the following messages.
#
# conftest.c(40): warning C4391: 'char log()': incorrect return type for intrinsic function, expected 'double'
# conftest.c(44): error C2168: 'log': too few actual parameters for intrinsic function
#
# Since log is always available on MSVC (it's been around since the dawn of
# time), we simply always assume it's there if MSVC is detected.
if test "x$je_cv_msvc" = "xyes" ; then
LM=
else
AC_SEARCH_LIBS([log], [m], , [AC_MSG_ERROR([Missing math functions])])
if test "x$ac_cv_search_log" != "xnone required" ; then
LM="$ac_cv_search_log"
else
LM=
fi
fi
AC_SUBST(LM)
@ -939,6 +1064,30 @@ if test "x${je_cv_cold}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_ATTR_COLD], [ ], [ ])
fi
dnl Check for deprecated attribute support.
JE_CFLAGS_SAVE()
JE_CFLAGS_ADD([-Wdeprecated-declarations])
JE_COMPILABLE([deprecated attribute],
[#if !__has_attribute(deprecated)
#error "deprecated attribute not supported"
#endif
struct has_deprecated_field {
int good;
int __attribute__((deprecated("Do not use"))) bad;
};
],
[struct has_deprecated_field instance;
instance.good = 0;
instance.bad = 1;
],
[je_cv_deprecated])
JE_CFLAGS_RESTORE()
if test "x${je_cv_deprecated}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_ATTR_DEPRECATED], [ ], [ ])
JE_CFLAGS_ADD([-Wdeprecated-declarations])
JE_CXXFLAGS_ADD([-Wdeprecated-declarations])
fi
dnl Check for VM_MAKE_TAG for mmap support.
JE_COMPILABLE([vm_make_tag],
[#include <sys/mman.h>
@ -1052,11 +1201,11 @@ AC_SUBST([JEMALLOC_CPREFIX])
AC_ARG_WITH([export],
[AS_HELP_STRING([--without-export], [disable exporting jemalloc public APIs])],
[if test "x$with_export" = "xno"; then
AC_DEFINE([JEMALLOC_EXPORT],[], [ ])
AC_DEFINE([JEMALLOC_EXPORT], [ ], [ ])
fi]
)
public_syms="aligned_alloc calloc dallocx free mallctl mallctlbymib mallctlnametomib malloc malloc_conf malloc_conf_2_conf_harder malloc_message malloc_stats_print malloc_usable_size mallocx smallocx_${jemalloc_version_gid} nallocx posix_memalign rallocx realloc sallocx sdallocx xallocx"
public_syms="aligned_alloc calloc dallocx free free_sized free_aligned_sized mallctl mallctlbymib mallctlnametomib malloc malloc_conf malloc_conf_2_conf_harder malloc_message malloc_stats_print malloc_usable_size mallocx smallocx_${jemalloc_version_gid} nallocx posix_memalign rallocx realloc sallocx sdallocx xallocx"
dnl Check for additional platform-specific public API functions.
AC_CHECK_FUNC([memalign],
[AC_DEFINE([JEMALLOC_OVERRIDE_MEMALIGN], [ ], [ ])
@ -1064,6 +1213,9 @@ AC_CHECK_FUNC([memalign],
AC_CHECK_FUNC([valloc],
[AC_DEFINE([JEMALLOC_OVERRIDE_VALLOC], [ ], [ ])
public_syms="${public_syms} valloc"])
AC_CHECK_FUNC([pvalloc],
[AC_DEFINE([JEMALLOC_OVERRIDE_PVALLOC], [ ], [ ])
public_syms="${public_syms} pvalloc"])
AC_CHECK_FUNC([malloc_size],
[AC_DEFINE([JEMALLOC_HAVE_MALLOC_SIZE], [ ], [ ])
public_syms="${public_syms} malloc_size"])
@ -1077,6 +1229,16 @@ if test "x${JEMALLOC_PREFIX}" = "x" ; then
AC_CHECK_FUNC([__libc_free],
[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_FREE], [ ], [ ])
wrap_syms="${wrap_syms} __libc_free"])
dnl __libc_free_sized and __libc_free_aligned_sized are here speculatively
dnl under the assumption that glibc will eventually define symbols with these
dnl names. In the event glibc chooses different names for these symbols,
dnl these will need to be amended to match.
AC_CHECK_FUNC([__libc_free_sized],
[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_FREE_SIZED], [ ], [ ])
wrap_syms="${wrap_syms} __libc_free_sized"])
AC_CHECK_FUNC([__libc_free_aligned_sized],
[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_FREE_ALIGNED_SIZED], [ ], [ ])
wrap_syms="${wrap_syms} __libc_free_aligned_sized"])
AC_CHECK_FUNC([__libc_malloc],
[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_MALLOC], [ ], [ ])
wrap_syms="${wrap_syms} __libc_malloc"])
@ -1089,6 +1251,9 @@ if test "x${JEMALLOC_PREFIX}" = "x" ; then
AC_CHECK_FUNC([__libc_valloc],
[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_VALLOC], [ ], [ ])
wrap_syms="${wrap_syms} __libc_valloc"])
AC_CHECK_FUNC([__libc_pvalloc],
[AC_DEFINE([JEMALLOC_OVERRIDE___LIBC_PVALLOC], [ ], [ ])
wrap_syms="${wrap_syms} __libc_pvalloc"])
AC_CHECK_FUNC([__posix_memalign],
[AC_DEFINE([JEMALLOC_OVERRIDE___POSIX_MEMALIGN], [ ], [ ])
wrap_syms="${wrap_syms} __posix_memalign"])
@ -1256,6 +1421,23 @@ if test "x$enable_stats" = "x1" ; then
fi
AC_SUBST([enable_stats])
dnl Disable reading configuration from file and environment variable
AC_ARG_ENABLE([user_config],
[AS_HELP_STRING([--disable-user-config],
[Do not read malloc config from /etc/malloc.conf or MALLOC_CONF])],
[if test "x$enable_user_config" = "xno" ; then
enable_user_config="0"
else
enable_user_config="1"
fi
],
[enable_user_config="1"]
)
if test "x$enable_user_config" = "x1" ; then
AC_DEFINE([JEMALLOC_CONFIG_ENV], [ ], [ ])
AC_DEFINE([JEMALLOC_CONFIG_FILE], [ ], [ ])
fi
dnl Do not enable smallocx by default.
AC_ARG_ENABLE([experimental_smallocx],
[AS_HELP_STRING([--enable-experimental-smallocx], [Enable experimental smallocx API])],
@ -1329,6 +1511,33 @@ if test "x$backtrace_method" = "x" -a "x$enable_prof_libunwind" = "x1" ; then
fi
fi
if test `uname -s` = "Linux"
then
AC_ARG_ENABLE([prof-frameptr],
[AS_HELP_STRING([--enable-prof-frameptr], [Use optimized frame pointer unwinder for backtracing (Linux only)])],
[if test "x$enable_prof_frameptr" = "xno" ; then
enable_prof_frameptr="0"
else
enable_prof_frameptr="1"
if test "x$enable_prof" = "x0" ; then
AC_MSG_ERROR([--enable-prof-frameptr should only be used with --enable-prof])
fi
fi
],
[enable_prof_frameptr="0"]
)
if test "x$backtrace_method" = "x" -a "x$enable_prof_frameptr" = "x1" \
-a "x$GCC" = "xyes" ; then
JE_CFLAGS_ADD([-fno-omit-frame-pointer])
backtrace_method="frame pointer linux"
AC_DEFINE([JEMALLOC_PROF_FRAME_POINTER], [ ], [ ])
else
enable_prof_frameptr="0"
fi
else
enable_prof_frameptr="0"
fi
AC_ARG_ENABLE([prof-libgcc],
[AS_HELP_STRING([--disable-prof-libgcc],
[Do not use libgcc for backtracing])],
@ -1404,6 +1613,18 @@ if test "x$zero_realloc_default_free" = "x1" ; then
AC_DEFINE([JEMALLOC_ZERO_REALLOC_DEFAULT_FREE], [ ], [ ])
fi
dnl Support allocation from DSS by default
AC_ARG_ENABLE([dss],
[AS_HELP_STRING([--disable-dss], [Disable usage of sbrk(2)])],
[if test "x$enable_dss" = "xno" ; then
enable_dss="0"
else
enable_dss="1"
fi
],
[enable_dss="1"]
)
dnl Enable allocation from DSS if supported by the OS.
have_dss="1"
dnl Check whether the BSD/SUSv1 sbrk() exists. If not, disable DSS support.
@ -1417,7 +1638,7 @@ else
have_dss="0"
fi
if test "x$have_dss" = "x1" ; then
if test "x$have_dss" = "x1" -a "x$enable_dss" = "x1" ; then
AC_DEFINE([JEMALLOC_DSS], [ ], [ ])
fi
@ -1480,6 +1701,55 @@ else
fi
AC_SUBST([enable_utrace])
dnl Disable experimental sdt tracing by default.
AC_ARG_ENABLE([experimental-sdt],
[AS_HELP_STRING([--enable-experimental-sdt], [Enable systemtap USDT probes])],
[if test "x$enable_experimental_sdt" = "xno" ; then
enable_experimental_sdt="0"
else
JE_COMPILABLE([systemtap sdt], [
#include <sys/sdt.h>
], [
void foo(int i, void *p) { STAP_PROBE2(jemalloc, test, i, p); }
],
[je_cv_stap_sdt])
if test "x${je_cv_stap_sdt}" = "xyes" ; then
enable_experimental_sdt="1"
elif test "x${abi}" = "xelf" ; then
case "${host}" in
*-*-linux-android*)
case "${host_cpu}" in aarch64|x86_64)
enable_experimental_sdt="2"
;;
esac
;;
*-*-linux*)
case "${host_cpu}" in x86_64|aarch64|arm*)
enable_experimental_sdt="2"
;;
esac
;;
*)
enable_experimental_sdt="0"
AC_MSG_ERROR([Unsupported sdt on this platform])
;;
esac
else
AC_MSG_ERROR([Unsupported sdt on this platform])
fi
fi
],
[enable_experimental_sdt="0"]
)
if test "x$enable_experimental_sdt" = "x1" ; then
AC_DEFINE([JEMALLOC_EXPERIMENTAL_USDT_STAP], [ ], [ ])
elif test "x$enable_experimental_sdt" = "x2"; then
AC_DEFINE([JEMALLOC_EXPERIMENTAL_USDT_CUSTOM], [ ], [ ])
fi
AC_SUBST([enable_experimental_sdt])
dnl Do not support the xmalloc option by default.
AC_ARG_ENABLE([xmalloc],
[AS_HELP_STRING([--enable-xmalloc], [Support xmalloc option])],
@ -1545,6 +1815,22 @@ if test "x$enable_readlinkat" = "x1" ; then
fi
AC_SUBST([enable_readlinkat])
dnl Do not force getenv by default
AC_ARG_ENABLE([force-getenv],
[AS_HELP_STRING([--enable-force-getenv], [Use getenv over secure_getenv])],
[if test "x$enable_force_getenv" = "xno" ; then
enable_force_getenv="0"
else
enable_force_getenv="1"
fi
],
[enable_force_getenv="0"]
)
if test "x$enable_force_getenv" = "x1" ; then
AC_DEFINE([JEMALLOC_FORCE_GETENV], [ ], [ ])
fi
AC_SUBST([force_getenv])
dnl Avoid extra safety checks by default
AC_ARG_ENABLE([opt-safety-checks],
[AS_HELP_STRING([--enable-opt-safety-checks],
@ -1592,7 +1878,7 @@ fi
[enable_uaf_detection="0"]
)
if test "x$enable_uaf_detection" = "x1" ; then
AC_DEFINE([JEMALLOC_UAF_DETECTION], [ ])
AC_DEFINE([JEMALLOC_UAF_DETECTION], [ ], [ ])
fi
AC_SUBST([enable_uaf_detection])
@ -1694,6 +1980,16 @@ case "${host}" in
LG_PAGE=14
fi
;;
*-*-linux-android)
if test "x$LG_PAGE" = "xdetect"; then
AC_CHECK_DECLS([PAGE_SIZE], [LG_PAGE=12], [LG_PAGE=14], [#include <sys/user.h>])
fi
;;
aarch64-unknown-linux-*)
if test "x$LG_PAGE" = "xdetect"; then
LG_PAGE=16
fi
;;
esac
if test "x$LG_PAGE" = "xdetect"; then
AC_CACHE_CHECK([LG_PAGE],
@ -1758,7 +2054,7 @@ if test "x${je_cv_lg_hugepage}" = "x" ; then
dnl Hugepagesize: 2048 kB
if test -e "/proc/meminfo" ; then
hpsk=[`cat /proc/meminfo 2>/dev/null | \
grep -e '^Hugepagesize:[[:space:]]\+[0-9]\+[[:space:]]kB$' | \
grep '^Hugepagesize:[[:space:]]\{1,\}[0-9]\{1,\}[[:space:]]kB$' | \
awk '{print $2}'`]
if test "x${hpsk}" != "x" ; then
je_cv_lg_hugepage=10
@ -1855,6 +2151,16 @@ dnl Check if we have dlsym support.
if test "x${je_cv_pthread_getname_np}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_PTHREAD_GETNAME_NP], [ ], [ ])
fi
dnl Check if pthread_set_name_np is available with the expected API.
JE_COMPILABLE([pthread_set_name_np(3)], [
#include <pthread.h>
#include <pthread_np.h>
], [
pthread_set_name_np(pthread_self(), "set_name_test");
], [je_cv_pthread_set_name_np])
if test "x${je_cv_pthread_set_name_np}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_PTHREAD_SET_NAME_NP], [ ], [ ])
fi
dnl Check if pthread_get_name_np is not necessarily present despite
dnl the pthread_set_name_np counterpart
JE_COMPILABLE([pthread_get_name_np(3)], [
@ -1942,6 +2248,16 @@ if test "x${je_cv_clock_realtime}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_CLOCK_REALTIME], [ ], [ ])
fi
dnl Check for clock_gettime_nsec_np().
JE_COMPILABLE([clock_gettime_nsec_np()], [
#include <time.h>
], [
clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
], [je_cv_clock_gettime_nsec_np])
if test "x${je_cv_clock_gettime_nsec_np}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_CLOCK_GETTIME_NSEC_NP], [ ], [ ])
fi
dnl Use syscall(2) (if available) by default.
AC_ARG_ENABLE([syscall],
[AS_HELP_STRING([--disable-syscall], [Disable use of syscall(2)])],
@ -1998,6 +2314,15 @@ if test "x$have_sched_setaffinity" = "x1" ; then
AC_DEFINE([JEMALLOC_HAVE_SCHED_SETAFFINITY], [ ], [ ])
fi
dnl Check if the pthread_setaffinity_np function exists.
AC_CHECK_FUNC([pthread_setaffinity_np],
[have_pthread_setaffinity_np="1"],
[have_pthread_setaffinity_np="0"]
)
if test "x$have_pthread_setaffinity_np" = "x1" ; then
AC_DEFINE([JEMALLOC_HAVE_PTHREAD_SETAFFINITY_NP], [ ], [ ])
fi
dnl Check if the Solaris/BSD issetugid function exists.
AC_CHECK_FUNC([issetugid],
[have_issetugid="1"],
@ -2041,6 +2366,14 @@ if test "x$have_memcntl" = "x1" ; then
AC_DEFINE([JEMALLOC_HAVE_MEMCNTL], [ ], [ ])
fi
AC_CHECK_FUNC([prctl],
[have_prctl="1"],
[have_prctl="0"],
)
if test "x$have_prctl" = "x1" ; then
AC_DEFINE([JEMALLOC_HAVE_PRCTL], [ ], [ ])
fi
dnl Disable lazy locking by default.
AC_ARG_ENABLE([lazy_lock],
[AS_HELP_STRING([--enable-lazy-lock],
@ -2203,6 +2536,13 @@ if test "x${je_cv_osatomic}" = "xyes" ; then
fi
dnl ============================================================================
AC_ARG_WITH([experimental_sys_process_madvise],
[AS_HELP_STRING([--with-experimental-sys-process-madvise=<experimental-sys-process-madvise>],
[Force process_madvise and use experimental-sys-process-madvise number when making syscall])],
[je_cv_sys_pmadv_nr="${with_experimental_sys_process_madvise}"],
[je_cv_sys_pmadv_nr=""])
dnl Check for madvise(2).
JE_COMPILABLE([madvise(2)], [
@ -2260,6 +2600,16 @@ if test "x${je_cv_madvise}" = "xyes" ; then
madvise((void *)0, 0, MADV_HUGEPAGE);
madvise((void *)0, 0, MADV_NOHUGEPAGE);
], [je_cv_thp])
case "${host_cpu}" in
arm*)
;;
*)
if test "x${je_cv_thp}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_MADVISE_HUGE], [ ], [ ])
fi
;;
esac
dnl Check for madvise(..., MADV_[NO]CORE).
JE_COMPILABLE([madvise(..., MADV_[[NO]]CORE)], [
#include <sys/mman.h>
@ -2270,15 +2620,35 @@ if test "x${je_cv_madvise}" = "xyes" ; then
if test "x${je_cv_madv_nocore}" = "xyes" ; then
AC_DEFINE([JEMALLOC_MADVISE_NOCORE], [ ], [ ])
fi
case "${host_cpu}" in
arm*)
;;
*)
if test "x${je_cv_thp}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_MADVISE_HUGE], [ ], [ ])
dnl Check for madvise(..., MADV_COLLAPSE).
JE_COMPILABLE([madvise(..., MADV_COLLAPSE)], [
#include <sys/mman.h>
], [
madvise((void *)0, 0, MADV_COLLAPSE);
], [je_cv_madv_collapse])
if test "x${je_cv_madv_collapse}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_MADVISE_COLLAPSE], [ ], [ ])
fi
dnl Check for process_madvise
JE_COMPILABLE([process_madvise(2)], [
#include <sys/pidfd.h>
#include <sys/syscall.h>
#include <unistd.h>
], [
syscall(SYS_process_madvise, PIDFD_SELF, (void *)0, 0, 0, 0);
], [je_cv_process_madvise])
if test "x${je_cv_process_madvise}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_PROCESS_MADVISE], [ ], [ ])
else
if test "x${je_cv_sys_pmadv_nr}" != "x" ; then
dnl Forcing experimental usage of process_madvise
AC_MSG_RESULT([Forcing usage of process_madvise with syscall nr=${je_cv_sys_pmadv_nr}])
AC_DEFINE([JEMALLOC_HAVE_PROCESS_MADVISE], [ ], [ ])
AC_DEFINE_UNQUOTED([EXPERIMENTAL_SYS_PROCESS_MADVISE_NR], [${je_cv_sys_pmadv_nr}], [ ])
fi
fi
;;
esac
else
dnl Check for posix_madvise.
JE_COMPILABLE([posix_madvise], [
@ -2403,12 +2773,62 @@ AC_SUBST([enable_initial_exec_tls])
if test "x${je_cv_tls_model}" = "xyes" -a \
"x${enable_initial_exec_tls}" = "x1" ; then
AC_DEFINE([JEMALLOC_TLS_MODEL],
[__attribute__((tls_model("initial-exec")))],
[__attribute__((tls_model("initial-exec")))],
[ ])
else
AC_DEFINE([JEMALLOC_TLS_MODEL], [ ], [ ])
fi
dnl Do not compile with debugging by default.
AC_ARG_ENABLE([pageid],
[AS_HELP_STRING([--enable-pageid],
[Enable named pages])],
[if test "x$enable_pageid" = "xno" ; then
enable_pageid="0"
else
enable_pageid="1"
fi
],
[enable_pageid="0"]
)
if test "x$enable_pageid" = "x1" ; then
AC_DEFINE([JEMALLOC_PAGEID], [ ], [ ])
fi
AC_ARG_ENABLE([tsan],
[AS_HELP_STRING([--enable-tsan],
[Enable thread sanitizer])],
[if test "x$enable_tsan" = "xno" ; then
enable_tsan="0"
else
enable_tsan="1"
fi
],
[enable_tsan="0"]
)
if test "x$enable_tsan" = "x1" ; then
JE_CFLAGS_ADD([-fsanitize=thread])
JE_CXXFLAGS_ADD([-fsanitize=thread])
JE_LDFLAGS_ADD([-fsanitize=thread])
fi
AC_ARG_ENABLE([ubsan],
[AS_HELP_STRING([--enable-ubsan],
[Enable undefined behavior sanitizer])],
[if test "x$enable_ubsan" = "xno" ; then
enable_ubsan="0"
else
enable_ubsan="1"
fi
],
[enable_ubsan="0"]
)
if test "x$enable_ubsan" = "x1" ; then
JE_CFLAGS_ADD([-fsanitize=undefined])
JE_CXXFLAGS_ADD([-fsanitize=undefined])
JE_LDFLAGS_ADD([-fsanitize=undefined])
fi
dnl ============================================================================
dnl Enable background threads if possible.
@ -2468,6 +2888,15 @@ if test "x${je_cv_pthread_mutex_adaptive_np}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP], [ ], [ ])
fi
JE_COMPILABLE([gettid], [
#include <unistd.h>
], [
int tid = gettid();
], [je_cv_gettid])
if test "x${je_cv_gettid}" = "xyes" ; then
AC_DEFINE([JEMALLOC_HAVE_GETTID], [ ], [ ])
fi
JE_CFLAGS_SAVE()
JE_CFLAGS_ADD([-D_GNU_SOURCE])
JE_CFLAGS_ADD([-Werror])
@ -2482,9 +2911,19 @@ JE_COMPILABLE([strerror_r returns char with gnu source], [
char *error = strerror_r(EINVAL, buffer, 100);
printf("%s\n", error);
], [je_cv_strerror_r_returns_char_with_gnu_source])
if test "x${je_cv_strerror_r_returns_char_with_gnu_source}" = "xno" ; then
JE_COMPILABLE([strerror_r header only], [
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
], [], [je_cv_strerror_r_header_pass])
fi
JE_CFLAGS_RESTORE()
if test "x${je_cv_strerror_r_returns_char_with_gnu_source}" = "xyes" ; then
AC_DEFINE([JEMALLOC_STRERROR_R_RETURNS_CHAR_WITH_GNU_SOURCE], [ ], [ ])
elif test "x${je_cv_strerror_r_header_pass}" = "xno" ; then
AC_MSG_ERROR([cannot determine return type of strerror_r])
fi
dnl ============================================================================
@ -2622,7 +3061,8 @@ AC_MSG_RESULT([CXX : ${CXX}])
AC_MSG_RESULT([CONFIGURE_CXXFLAGS : ${CONFIGURE_CXXFLAGS}])
AC_MSG_RESULT([SPECIFIED_CXXFLAGS : ${SPECIFIED_CXXFLAGS}])
AC_MSG_RESULT([EXTRA_CXXFLAGS : ${EXTRA_CXXFLAGS}])
AC_MSG_RESULT([LDFLAGS : ${LDFLAGS}])
AC_MSG_RESULT([CONFIGURE_LDFLAGS : ${CONFIGURE_LDFLAGS}])
AC_MSG_RESULT([SPECIFIED_LDFLAGS : ${SPECIFIED_LDFLAGS}])
AC_MSG_RESULT([EXTRA_LDFLAGS : ${EXTRA_LDFLAGS}])
AC_MSG_RESULT([DSO_LDFLAGS : ${DSO_LDFLAGS}])
AC_MSG_RESULT([LIBS : ${LIBS}])
@ -2638,6 +3078,8 @@ AC_MSG_RESULT([INCLUDEDIR : ${INCLUDEDIR}])
AC_MSG_RESULT([LIBDIR : ${LIBDIR}])
AC_MSG_RESULT([MANDIR : ${MANDIR}])
AC_MSG_RESULT([])
AC_MSG_RESULT([LG_PAGE : ${LG_PAGE}])
AC_MSG_RESULT([])
AC_MSG_RESULT([srcroot : ${srcroot}])
AC_MSG_RESULT([abs_srcroot : ${abs_srcroot}])
AC_MSG_RESULT([objroot : ${objroot}])
@ -2654,9 +3096,11 @@ AC_MSG_RESULT([static libs : ${enable_static}])
AC_MSG_RESULT([autogen : ${enable_autogen}])
AC_MSG_RESULT([debug : ${enable_debug}])
AC_MSG_RESULT([stats : ${enable_stats}])
AC_MSG_RESULT([user_config : ${enable_user_config}])
AC_MSG_RESULT([experimental_smallocx : ${enable_experimental_smallocx}])
AC_MSG_RESULT([prof : ${enable_prof}])
AC_MSG_RESULT([prof-libunwind : ${enable_prof_libunwind}])
AC_MSG_RESULT([prof-frameptr : ${enable_prof_frameptr}])
AC_MSG_RESULT([prof-libgcc : ${enable_prof_libgcc}])
AC_MSG_RESULT([prof-gcc : ${enable_prof_gcc}])
AC_MSG_RESULT([fill : ${enable_fill}])
@ -2665,5 +3109,9 @@ AC_MSG_RESULT([xmalloc : ${enable_xmalloc}])
AC_MSG_RESULT([log : ${enable_log}])
AC_MSG_RESULT([lazy_lock : ${enable_lazy_lock}])
AC_MSG_RESULT([cache-oblivious : ${enable_cache_oblivious}])
AC_MSG_RESULT([pageid : ${enable_pageid}])
AC_MSG_RESULT([cxx : ${enable_cxx}])
AC_MSG_RESULT([dss : ${enable_dss}])
AC_MSG_RESULT([tsan : ${enable_tsan}])
AC_MSG_RESULT([ubsan : ${enable_ubsan}])
AC_MSG_RESULT([===============================================================================])


@ -33,6 +33,8 @@
<refname>aligned_alloc</refname>
<refname>realloc</refname>
<refname>free</refname>
<refname>free_sized</refname>
<refname>free_aligned_sized</refname>
<refname>mallocx</refname>
<refname>rallocx</refname>
<refname>xallocx</refname>
@ -89,6 +91,17 @@
<funcdef>void <function>free</function></funcdef>
<paramdef>void *<parameter>ptr</parameter></paramdef>
</funcprototype>
<funcprototype>
<funcdef>void <function>free_sized</function></funcdef>
<paramdef>void *<parameter>ptr</parameter></paramdef>
<paramdef>size_t <parameter>size</parameter></paramdef>
</funcprototype>
<funcprototype>
<funcdef>void <function>free_aligned_sized</function></funcdef>
<paramdef>void *<parameter>ptr</parameter></paramdef>
<paramdef>size_t <parameter>alignment</parameter></paramdef>
<paramdef>size_t <parameter>size</parameter></paramdef>
</funcprototype>
</refsect2>
<refsect2>
<title>Non-standard API</title>
@ -227,6 +240,17 @@
allocated memory referenced by <parameter>ptr</parameter> to be made
available for future allocations. If <parameter>ptr</parameter> is
<constant>NULL</constant>, no action occurs.</para>
<para>The <function>free_sized()</function> function is an extension of
<function>free()</function> with a <parameter>size</parameter> parameter
to allow the caller to pass in the allocation size as an optimization.
</para>
<para>The <function>free_aligned_sized()</function> function accepts a
<parameter>ptr</parameter> which was allocated with a requested
<parameter>size</parameter> and <parameter>alignment</parameter>, causing
the allocated memory referenced by <parameter>ptr</parameter> to be made
available for future allocations.</para>
</refsect2>
<refsect2>
<title>Non-standard API</title>
@ -451,6 +475,24 @@ for (i = 0; i < nbins; i++) {
depended on, since such behavior is entirely implementation-dependent.
</para>
</refsect2>
<refsect2>
<title>Interactions Between the Standard and Non-standard APIs</title>
<para>Generally speaking, it is permissible to pass pointers obtained from
the standard API to the non-standard API and vice versa (e.g. calling
<function>free()</function> with a pointer returned by a call to
<function>mallocx()</function>, or calling <function>sdallocx()</function>
with a pointer returned by a call to <function>calloc()</function>).
There are, however, a few exceptions. The C23 standard forbids calling
<function>free_sized()</function> on a pointer returned by
<function>aligned_alloc()</function>, mandating that either
<function>free_aligned_sized()</function> or <function>free()</function>
be used instead. In keeping with this, using any combination of the
standard and non-standard APIs in an equivalent fashion (i.e. taking a
pointer which was allocated with an explicitly requested alignment and
attempting to free it via an API that accepts a size hint, without also
providing the alignment hint) is likewise forbidden.
</para>
</refsect2>
</refsect1>
<refsect1 id="tuning">
<title>TUNING</title>
@ -1095,7 +1137,7 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay",
</term>
<listitem><para>Maximum number of background threads that will be created
if <link linkend="background_thread">background_thread</link> is set.
Defaults to number of cpus.</para></listitem>
Defaults to 4.</para></listitem>
</varlistentry>
<varlistentry id="opt.dirty_decay_ms">
@ -1121,9 +1163,7 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay",
linkend="arena.i.dirty_decay_ms"><mallctl>arena.&lt;i&gt;.dirty_decay_ms</mallctl></link>
for related dynamic control options. See <link
linkend="opt.muzzy_decay_ms"><mallctl>opt.muzzy_decay_ms</mallctl></link>
for a description of muzzy pages.for a description of muzzy pages. Note
that when the <link
linkend="opt.oversize_threshold"><mallctl>oversize_threshold</mallctl></link>
for a description of muzzy pages. Note that when the <link linkend="opt.oversize_threshold"><mallctl>oversize_threshold</mallctl></link>
feature is enabled, the arenas reserved for oversize requests may have
its own default decay settings.</para></listitem>
</varlistentry>
@ -1145,7 +1185,7 @@ mallctl("arena." STRINGIFY(MALLCTL_ARENAS_ALL) ".decay",
purged according to a sigmoidal decay curve that starts and ends with
zero purge rate. A decay time of 0 causes all unused muzzy pages to be
purged immediately upon creation. A decay time of -1 disables purging.
The default decay time is 10 seconds. See <link
Muzzy decay is disabled by default (with decay time 0). See <link
linkend="arenas.muzzy_decay_ms"><mallctl>arenas.muzzy_decay_ms</mallctl></link>
and <link
linkend="arena.i.muzzy_decay_ms"><mallctl>arena.&lt;i&gt;.muzzy_decay_ms</mallctl></link>
@ -1369,6 +1409,17 @@ malloc_conf = "xmalloc:true";]]></programlisting>
extent hooks.</para></listitem>
</varlistentry>
<varlistentry id="opt.prof_bt_max">
<term>
<mallctl>opt.prof_bt_max</mallctl>
(<type>unsigned</type>)
<literal>r-</literal>
[<option>--enable-prof</option>]
</term>
<listitem><para>Maximum number of stack frames to record in profiling
backtraces. The default is 128.</para></listitem>
</varlistentry>
<varlistentry id="opt.prof">
<term>
<mallctl>opt.prof</mallctl>
@ -1474,6 +1525,23 @@ malloc_conf = "xmalloc:true";]]></programlisting>
by default.</para></listitem>
</varlistentry>
<varlistentry id="opt.prof_pid_namespace">
<term>
<mallctl>opt.prof_pid_namespace</mallctl>
(<type>bool</type>)
<literal>r-</literal>
[<option>--enable-prof</option>]
</term>
<listitem><para>Enable adding the pid namespace to the profile
filename. Profiles are dumped to files named according to the pattern
<filename>&lt;prefix&gt;.&lt;pid_namespace&gt;.&lt;pid&gt;.&lt;seq&gt;.i&lt;iseq&gt;.heap</filename>,
where <literal>&lt;prefix&gt;</literal> is controlled by the <link
linkend="opt.prof_prefix"><mallctl>opt.prof_prefix</mallctl></link> and
<link linkend="prof.prefix"><mallctl>prof.prefix</mallctl></link>
options.
</para></listitem>
</varlistentry>
<varlistentry id="opt.lg_prof_interval">
<term>
<mallctl>opt.lg_prof_interval</mallctl>
@ -1599,6 +1667,53 @@ malloc_conf = "xmalloc:true";]]></programlisting>
testing this behavior.</para></listitem>
</varlistentry>
<varlistentry id="opt.debug_double_free_max_scan">
<term>
<mallctl>opt.debug_double_free_max_scan</mallctl>
(<type>unsigned</type>)
<literal>r-</literal>
[<option>--enable-debug</option>]
</term>
<listitem><para>Maximum number of cached pointers to scan in the
thread cache when checking for double-free errors on deallocation.
When debug is enabled, each deallocation into the tcache scans up to
this many recently cached pointers to detect whether the same pointer
is being freed twice. Setting this to 0 disables the check. This
option is set to 0 and has no effect when debug is not enabled. The
default is 32.</para></listitem>
</varlistentry>
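The scan limit described above is a runtime option, so it can be tuned through <literal>MALLOC_CONF</literal>. A sketch, assuming a <option>--enable-debug</option> build; <command>my_app</command> and the values are illustrative:

```shell
# Scan up to 64 recently cached pointers on each tcache deallocation
# (only has an effect in --enable-debug builds).
MALLOC_CONF="debug_double_free_max_scan:64" ./my_app

# Disable the double-free check entirely.
MALLOC_CONF="debug_double_free_max_scan:0" ./my_app
```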
<varlistentry id="opt.disable_large_size_classes">
<term>
<mallctl>opt.disable_large_size_classes</mallctl>
(<type>bool</type>)
<literal>r-</literal>
</term>
<listitem><para>When enabled (the default), large allocations
(i.e. allocations of size &gt;= <constant>SC_LARGE_MINCLASS</constant>)
are rounded up to the nearest page boundary rather than the nearest
large size class. This minimizes memory overhead, especially when
using hugepages, at the cost of disabling the standard large size
class hierarchy.</para></listitem>
</varlistentry>
<varlistentry id="opt.process_madvise_max_batch">
<term>
<mallctl>opt.process_madvise_max_batch</mallctl>
(<type>size_t</type>)
<literal>r-</literal>
</term>
<listitem><para>Maximum number of memory regions to include in each
<citerefentry><refentrytitle>process_madvise</refentrytitle>
<manvolnum>2</manvolnum></citerefentry> batch call. When set to 0
(the default), process_madvise is not used, and the standard
<citerefentry><refentrytitle>madvise</refentrytitle>
<manvolnum>2</manvolnum></citerefentry> is used instead. Setting this
to a positive value enables batched purging via process_madvise, which
can reduce the number of system calls needed for
purging.</para></listitem>
</varlistentry>
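Batched purging can likewise be enabled through <literal>MALLOC_CONF</literal>. A sketch, assuming a kernel that supports <function>process_madvise</function>(2); <command>my_app</command> and the batch size are illustrative:

```shell
# Purge up to 16 memory regions per process_madvise(2) call.
MALLOC_CONF="process_madvise_max_batch:16" ./my_app

# The default of 0 keeps purging on plain madvise(2), one region per call.
```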
<varlistentry id="thread.arena">
<term>
<mallctl>thread.arena</mallctl>
@ -1735,6 +1850,47 @@ malloc_conf = "xmalloc:true";]]></programlisting>
the developer may find manual flushing useful.</para></listitem>
</varlistentry>
<varlistentry id="thread.tcache.max">
<term>
<mallctl>thread.tcache.max</mallctl>
(<type>size_t</type>)
<literal>rw</literal>
</term>
<listitem><para>Get or set the maximum cached size class
(<varname>tcache_max</varname>) for the calling thread's tcache. The
value is clamped to the maximum allowed limit and rounded up to the
nearest size class boundary. Changing this value will resize the
thread cache accordingly.</para></listitem>
</varlistentry>
<varlistentry id="thread.tcache.ncached_max.read_sizeclass">
<term>
<mallctl>thread.tcache.ncached_max.read_sizeclass</mallctl>
(<type>size_t</type>)
<literal>rw</literal>
</term>
<listitem><para>Query the maximum number of cached objects
(<varname>ncached_max</varname>) for a given size class in the calling
thread's tcache. The size class is passed in via
<parameter>newp</parameter>, and the corresponding
<varname>ncached_max</varname> is returned via
<parameter>oldp</parameter>.</para></listitem>
</varlistentry>
<varlistentry id="thread.tcache.ncached_max.write">
<term>
<mallctl>thread.tcache.ncached_max.write</mallctl>
(<type>char *</type>)
<literal>-w</literal>
</term>
<listitem><para>Set the maximum number of cached objects
(<varname>ncached_max</varname>) for size classes in the calling
thread's tcache. The input is a string of pipe-separated settings,
where each setting specifies a size range and a count, in the same
format as the <mallctl>opt.tcache_ncached_max</mallctl> runtime
option.</para></listitem>
</varlistentry>
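As a hedged sketch of the string format (the size ranges and counts below are made-up illustrative values, and error handling is elided), a thread could apply such settings with a single <function>mallctl()</function> call:

```c
#include <jemalloc/jemalloc.h>

/* Sketch: cache up to 100 objects per bin for sizes 0-256, and up to
 * 10 for sizes 257-4096.  The ranges and counts are illustrative only;
 * the exact boundaries depend on the build's size classes. */
static void
tune_tcache_depth(void) {
	const char *settings = "0-256:100|257-4096:10";
	mallctl("thread.tcache.ncached_max.write", NULL, NULL,
	    (void *)&settings, sizeof(settings));
}
```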
<varlistentry id="thread.prof.name">
<term>
<mallctl>thread.prof.name</mallctl>
@ -1918,6 +2074,24 @@ malloc_conf = "xmalloc:true";]]></programlisting>
linkend="thread.arena"><mallctl>thread.arena</mallctl></link>.</para></listitem>
</varlistentry>
<varlistentry id="arena.i.name">
<term>
<mallctl>arena.&lt;i&gt;.name</mallctl>
(<type>char *</type>)
<literal>rw</literal>
</term>
<listitem><para>Get or set a descriptive name for arena &lt;i&gt;.
Arena names can be up to 32 characters long (including the null
terminator); longer names are truncated. When reading, the caller
passes a pointer to a pre-allocated buffer (of at least 32 bytes) via
<parameter>oldp</parameter>, and
<parameter>*oldlenp</parameter> must be
<code language="C">sizeof(<type>char *</type>)</code>.
Arena names are also included in the output of <link
linkend="stats_print"><function>malloc_stats_print()</function></link>.
</para></listitem>
</varlistentry>
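A hedged sketch of the read/write convention described above (arena index 0 and the name string are arbitrary choices, and return values are left unchecked):

```c
#include <jemalloc/jemalloc.h>
#include <stdio.h>

static void
label_and_read_arena0(void) {
	/* Write: newp points at the string pointer. */
	const char *name = "request-cache"; /* hypothetical name */
	mallctl("arena.0.name", NULL, NULL, (void *)&name, sizeof(name));

	/* Read: oldp points at a pointer to a caller-owned buffer of at
	 * least 32 bytes, and *oldlenp is sizeof(char *). */
	char buf[32];
	char *bufp = buf;
	size_t len = sizeof(bufp);
	mallctl("arena.0.name", (void *)&bufp, &len, NULL, 0);
	printf("arena 0 is named \"%s\"\n", buf);
}
```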
<varlistentry id="arena.i.dss">
<term>
<mallctl>arena.&lt;i&gt;.dss</mallctl>
@ -2275,6 +2449,18 @@ struct extent_hooks_s {
<listitem><para>Page size.</para></listitem>
</varlistentry>
<varlistentry id="arenas.hugepage">
<term>
<mallctl>arenas.hugepage</mallctl>
(<type>size_t</type>)
<literal>r-</literal>
</term>
<listitem><para>Hugepage size. This value is also reported in the
output of <link
linkend="stats_print"><function>malloc_stats_print()</function></link>.
</para></listitem>
</varlistentry>
<varlistentry id="arenas.tcache_max">
<term>
<mallctl>arenas.tcache_max</mallctl>
@ -2494,6 +2680,24 @@ struct extent_hooks_s {
option for additional information.</para></listitem>
</varlistentry>
<varlistentry id="approximate_stats.active">
<term>
<mallctl>approximate_stats.active</mallctl>
(<type>size_t</type>)
<literal>r-</literal>
</term>
<listitem><para>Return the total number of bytes in active pages
collected in an unsynchronized manner, without requiring an
<link linkend="epoch"><mallctl>epoch</mallctl></link> update.
As a result, this value should NOT be compared with other
stats. For example, the relative ordering between
<mallctl>approximate_stats.active</mallctl> and <link
linkend="stats.active"><mallctl>stats.active</mallctl></link> or <link
linkend="stats.resident"><mallctl>stats.resident</mallctl></link> is
not guaranteed. This interface is intended for lightweight monitoring
where an approximate value is sufficient.</para></listitem>
</varlistentry>
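A hedged sketch of a lightweight monitoring helper using this interface (unlike <mallctl>stats.active</mallctl>, no prior <mallctl>epoch</mallctl> update is required):

```c
#include <jemalloc/jemalloc.h>
#include <stdio.h>

static void
log_approx_active(void) {
	size_t active;
	size_t len = sizeof(active);
	/* Unsynchronized estimate; cheap enough to poll frequently. */
	if (mallctl("approximate_stats.active", &active, &len, NULL, 0) == 0) {
		printf("approx active bytes: %zu\n", active);
	}
}
```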
<varlistentry id="stats.allocated">
<term>
<mallctl>stats.allocated</mallctl>
@ -3267,7 +3471,7 @@ struct extent_hooks_s {
<listitem><para>Current number of nonfull slabs.</para></listitem>
</varlistentry>
<varlistentry id="stats.arenas.i.bins.j.mutex">
<term>
<mallctl>stats.arenas.&lt;i&gt;.bins.&lt;j&gt;.mutex.{counter}</mallctl>
(<type>counter specific type</type>) <literal>r-</literal>


@ -99,7 +99,25 @@ Using this approach means that there are a few things users need to be aware of.
If one stack appears twice as often as another, this by itself does not imply that it allocates twice as often. Consider the case in which there are only two types of allocating call stacks in a program. Stack A allocates 8 bytes, and occurs a million times in a program. Stack B allocates 8 MB, and occurs just once in a program. If our sampling rate $R$ is about 1MB, we expect stack A to show up about 8 times, and stack B to show up once. Stack A isn't 8 times more frequent than stack B, though; it's a million times more frequent.
### Aggregation must be done after unbiasing samples
Some tools manually parse heap dump output, and aggregate across stacks (or across program runs) to provide wider-scale data analyses. When doing this aggregation, though, it's important to unbias-and-then-sum, rather than sum-and-then-unbias. Reusing our example from the previous section: suppose we collect heap dumps of the program from 1 million machines. We then have 8 million samples of stack A (8 per machine, each of 8 bytes), and 1 million samples of stack B (1 per machine, each of 8 MB).
If we sum first and then unbias by dividing by the sampling probability $1 - e^{-Z/R}$, we get:
$$Z = 8,000,000 * 8 bytes = 64MB$$
$$64MB / (1 - e^{-64MB/1MB}) \approx 64MB (Stack A)$$
$$Z = 1,000,000 * 8MB = 8TB$$
$$8TB / (1 - e^{-8TB/1MB}) \approx 8TB (Stack B)$$
Clearly we are unbiasing by an infinitesimal amount, which dramatically underreports the amount of memory allocated by stack A. Whereas if we unbias first and then sum:
$$Z = 8 bytes$$
$$8 bytes / (1 - e^{-8 bytes/1MB}) \approx 1MB$$
$$1MB * 8,000,000 = 8TB (Stack A)$$
$$Z = 8MB$$
$$8MB / (1 - e^{-8MB/1MB}) \approx 8MB$$
$$8MB * 1,000,000 = 8TB (Stack B)$$
## An avenue for future exploration
While the framework we laid out above is pretty general, as an engineering decision we're only interested in fairly simple approaches (i.e. ones for which the chance of an allocation being sampled depends only on its size). Our job is then: for each size class $Z$, pick a probability $p_Z$ that an allocation of that size will be sampled. We made some handwave-y references to statistical distributions to justify our choices, but there's no reason we need to pick them that way. Any set of non-zero probabilities is a valid choice.


@ -1,23 +0,0 @@
#ifndef JEMALLOC_INTERNAL_ACTIVITY_CALLBACK_H
#define JEMALLOC_INTERNAL_ACTIVITY_CALLBACK_H
/*
* The callback to be executed "periodically", in response to some amount of
* allocator activity.
*
* This callback need not be computing any sort of peak (although that's the
* intended first use case), but we drive it from the peak counter, so it
* keeps things tidy to keep it here.
*
* The calls to this thunk get driven by the peak_event module.
*/
#define ACTIVITY_CALLBACK_THUNK_INITIALIZER {NULL, NULL}
typedef void (*activity_callback_t)(void *uctx, uint64_t allocated,
uint64_t deallocated);
typedef struct activity_callback_thunk_s activity_callback_thunk_t;
struct activity_callback_thunk_s {
activity_callback_t callback;
void *uctx;
};
#endif /* JEMALLOC_INTERNAL_ACTIVITY_CALLBACK_H */


@ -1,8 +1,11 @@
#ifndef JEMALLOC_INTERNAL_ARENA_EXTERNS_H
#define JEMALLOC_INTERNAL_ARENA_EXTERNS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_stats.h"
#include "jemalloc/internal/bin.h"
#include "jemalloc/internal/div.h"
#include "jemalloc/internal/emap.h"
#include "jemalloc/internal/extent_dss.h"
#include "jemalloc/internal/hook.h"
#include "jemalloc/internal/pages.h"
@ -18,104 +21,105 @@ extern ssize_t opt_dirty_decay_ms;
extern ssize_t opt_muzzy_decay_ms;
extern percpu_arena_mode_t opt_percpu_arena;
extern const char *percpu_arena_mode_names[];
extern const char *const percpu_arena_mode_names[];
extern div_info_t arena_binind_div_info[SC_NBINS];
extern malloc_mutex_t arenas_lock;
extern emap_t arena_emap_global;
extern size_t opt_oversize_threshold;
extern size_t oversize_threshold;
extern bool opt_huge_arena_pac_thp;
extern pac_thp_t huge_arena_pac_thp;
/*
* arena_bin_offsets[binind] is the offset of the first bin shard for size class
* binind.
*/
extern uint32_t arena_bin_offsets[SC_NBINS];
void arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena,
unsigned *nthreads, const char **dss, ssize_t *dirty_decay_ms,
ssize_t *muzzy_decay_ms, size_t *nactive, size_t *ndirty, size_t *nmuzzy);
void arena_basic_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
size_t *nactive, size_t *ndirty, size_t *nmuzzy);
void arena_stats_merge(tsdn_t *tsdn, arena_t *arena, unsigned *nthreads,
const char **dss, ssize_t *dirty_decay_ms, ssize_t *muzzy_decay_ms,
size_t *nactive, size_t *ndirty, size_t *nmuzzy, arena_stats_t *astats,
bin_stats_data_t *bstats, arena_stats_large_t *lstats,
pac_estats_t *estats, hpa_shard_stats_t *hpastats, sec_stats_t *secstats);
bin_stats_data_t *bstats, arena_stats_large_t *lstats, pac_estats_t *estats,
hpa_shard_stats_t *hpastats);
void arena_handle_deferred_work(tsdn_t *tsdn, arena_t *arena);
edata_t *arena_extent_alloc_large(tsdn_t *tsdn, arena_t *arena,
size_t usize, size_t alignment, bool zero);
void arena_extent_dalloc_large_prep(tsdn_t *tsdn, arena_t *arena,
edata_t *edata);
void arena_extent_ralloc_large_shrink(tsdn_t *tsdn, arena_t *arena,
edata_t *edata, size_t oldsize);
void arena_extent_ralloc_large_expand(tsdn_t *tsdn, arena_t *arena,
edata_t *edata, size_t oldsize);
bool arena_decay_ms_set(tsdn_t *tsdn, arena_t *arena, extent_state_t state,
ssize_t decay_ms);
edata_t *arena_extent_alloc_large(
tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, bool zero);
void arena_extent_dalloc_large_prep(
tsdn_t *tsdn, arena_t *arena, edata_t *edata);
void arena_extent_ralloc_large_shrink(
tsdn_t *tsdn, arena_t *arena, edata_t *edata, size_t oldusize);
void arena_extent_ralloc_large_expand(
tsdn_t *tsdn, arena_t *arena, edata_t *edata, size_t oldusize);
bool arena_decay_ms_set(
tsdn_t *tsdn, arena_t *arena, extent_state_t state, ssize_t decay_ms);
ssize_t arena_decay_ms_get(arena_t *arena, extent_state_t state);
void arena_decay(tsdn_t *tsdn, arena_t *arena, bool is_background_thread,
bool all);
uint64_t arena_time_until_deferred(tsdn_t *tsdn, arena_t *arena);
void arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena);
void arena_reset(tsd_t *tsd, arena_t *arena);
void arena_destroy(tsd_t *tsd, arena_t *arena);
void arena_cache_bin_fill_small(tsdn_t *tsdn, arena_t *arena,
cache_bin_t *cache_bin, cache_bin_info_t *cache_bin_info, szind_t binind,
const unsigned nfill);
void arena_decay(
tsdn_t *tsdn, arena_t *arena, bool is_background_thread, bool all);
uint64_t arena_time_until_deferred(tsdn_t *tsdn, arena_t *arena);
void arena_do_deferred_work(tsdn_t *tsdn, arena_t *arena);
void arena_reset(tsd_t *tsd, arena_t *arena);
void arena_destroy(tsd_t *tsd, arena_t *arena);
cache_bin_sz_t arena_ptr_array_fill_small(tsdn_t *tsdn, arena_t *arena,
szind_t binind, cache_bin_ptr_array_t *arr, const cache_bin_sz_t nfill_min,
const cache_bin_sz_t nfill_max, cache_bin_stats_t merge_stats);
void *arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size,
szind_t ind, bool zero);
void *arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize,
size_t alignment, bool zero, tcache_t *tcache);
void arena_prof_promote(tsdn_t *tsdn, void *ptr, size_t usize);
void arena_dalloc_promoted(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
bool slow_path);
void *arena_malloc_hard(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind,
bool zero, bool slab);
void *arena_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
bool zero, bool slab, tcache_t *tcache);
void arena_prof_promote(
tsdn_t *tsdn, void *ptr, size_t usize, size_t bumped_usize);
void arena_dalloc_promoted(
tsdn_t *tsdn, void *ptr, tcache_t *tcache, bool slow_path);
void arena_slab_dalloc(tsdn_t *tsdn, arena_t *arena, edata_t *slab);
void arena_dalloc_bin_locked_handle_newly_empty(tsdn_t *tsdn, arena_t *arena,
edata_t *slab, bin_t *bin);
void arena_dalloc_bin_locked_handle_newly_nonempty(tsdn_t *tsdn, arena_t *arena,
edata_t *slab, bin_t *bin);
void arena_dalloc_small(tsdn_t *tsdn, void *ptr);
bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
size_t extra, bool zero, size_t *newsize);
void arena_dalloc_small(tsdn_t *tsdn, void *ptr);
void arena_ptr_array_flush(tsd_t *tsd, szind_t binind,
cache_bin_ptr_array_t *arr, unsigned nflush, bool small,
arena_t *stats_arena, cache_bin_stats_t merge_stats);
bool arena_ralloc_no_move(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
size_t extra, bool zero, size_t *newsize);
void *arena_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t oldsize,
size_t size, size_t alignment, bool zero, tcache_t *tcache,
size_t size, size_t alignment, bool zero, bool slab, tcache_t *tcache,
hook_ralloc_args_t *hook_args);
dss_prec_t arena_dss_prec_get(arena_t *arena);
ehooks_t *arena_get_ehooks(arena_t *arena);
extent_hooks_t *arena_set_extent_hooks(tsd_t *tsd, arena_t *arena,
extent_hooks_t *extent_hooks);
bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec);
dss_prec_t arena_dss_prec_get(arena_t *arena);
ehooks_t *arena_get_ehooks(arena_t *arena);
extent_hooks_t *arena_set_extent_hooks(
tsd_t *tsd, arena_t *arena, extent_hooks_t *extent_hooks);
bool arena_dss_prec_set(arena_t *arena, dss_prec_t dss_prec);
void arena_name_get(arena_t *arena, char *name);
void arena_name_set(arena_t *arena, const char *name);
ssize_t arena_dirty_decay_ms_default_get(void);
bool arena_dirty_decay_ms_default_set(ssize_t decay_ms);
bool arena_dirty_decay_ms_default_set(ssize_t decay_ms);
ssize_t arena_muzzy_decay_ms_default_get(void);
bool arena_muzzy_decay_ms_default_set(ssize_t decay_ms);
bool arena_retain_grow_limit_get_set(tsd_t *tsd, arena_t *arena,
size_t *old_limit, size_t *new_limit);
bool arena_muzzy_decay_ms_default_set(ssize_t decay_ms);
bool arena_retain_grow_limit_get_set(
tsd_t *tsd, arena_t *arena, size_t *old_limit, size_t *new_limit);
unsigned arena_nthreads_get(arena_t *arena, bool internal);
void arena_nthreads_inc(arena_t *arena, bool internal);
void arena_nthreads_dec(arena_t *arena, bool internal);
void arena_nthreads_inc(arena_t *arena, bool internal);
void arena_nthreads_dec(arena_t *arena, bool internal);
arena_t *arena_new(tsdn_t *tsdn, unsigned ind, const arena_config_t *config);
bool arena_init_huge(void);
bool arena_is_huge(unsigned arena_ind);
bool arena_init_huge(tsdn_t *tsdn, arena_t *a0);
arena_t *arena_choose_huge(tsd_t *tsd);
bin_t *arena_bin_choose(tsdn_t *tsdn, arena_t *arena, szind_t binind,
unsigned *binshard);
size_t arena_fill_small_fresh(tsdn_t *tsdn, arena_t *arena, szind_t binind,
void **ptrs, size_t nfill, bool zero);
bool arena_boot(sc_data_t *sc_data, base_t *base, bool hpa);
void arena_prefork0(tsdn_t *tsdn, arena_t *arena);
void arena_prefork1(tsdn_t *tsdn, arena_t *arena);
void arena_prefork2(tsdn_t *tsdn, arena_t *arena);
void arena_prefork3(tsdn_t *tsdn, arena_t *arena);
void arena_prefork4(tsdn_t *tsdn, arena_t *arena);
void arena_prefork5(tsdn_t *tsdn, arena_t *arena);
void arena_prefork6(tsdn_t *tsdn, arena_t *arena);
void arena_prefork7(tsdn_t *tsdn, arena_t *arena);
void arena_prefork8(tsdn_t *tsdn, arena_t *arena);
void arena_postfork_parent(tsdn_t *tsdn, arena_t *arena);
void arena_postfork_child(tsdn_t *tsdn, arena_t *arena);
bool arena_boot(sc_data_t *sc_data, base_t *base, bool hpa);
void arena_prefork0(tsdn_t *tsdn, arena_t *arena);
void arena_prefork1(tsdn_t *tsdn, arena_t *arena);
void arena_prefork2(tsdn_t *tsdn, arena_t *arena);
void arena_prefork3(tsdn_t *tsdn, arena_t *arena);
void arena_prefork4(tsdn_t *tsdn, arena_t *arena);
void arena_prefork5(tsdn_t *tsdn, arena_t *arena);
void arena_prefork6(tsdn_t *tsdn, arena_t *arena);
void arena_prefork7(tsdn_t *tsdn, arena_t *arena);
void arena_prefork8(tsdn_t *tsdn, arena_t *arena);
void arena_postfork_parent(tsdn_t *tsdn, arena_t *arena);
void arena_postfork_child(tsdn_t *tsdn, arena_t *arena);
#endif /* JEMALLOC_INTERNAL_ARENA_EXTERNS_H */


@ -1,6 +1,9 @@
#ifndef JEMALLOC_INTERNAL_ARENA_INLINES_A_H
#define JEMALLOC_INTERNAL_ARENA_INLINES_A_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_structs.h"
static inline unsigned
arena_ind_get(const arena_t *arena) {
return arena->ind;


@ -1,20 +1,29 @@
#ifndef JEMALLOC_INTERNAL_ARENA_INLINES_B_H
#define JEMALLOC_INTERNAL_ARENA_INLINES_B_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_externs.h"
#include "jemalloc/internal/arena_structs.h"
#include "jemalloc/internal/bin_inlines.h"
#include "jemalloc/internal/div.h"
#include "jemalloc/internal/emap.h"
#include "jemalloc/internal/jemalloc_internal_inlines_b.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/large_externs.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/prof_externs.h"
#include "jemalloc/internal/prof_structs.h"
#include "jemalloc/internal/rtree.h"
#include "jemalloc/internal/safety_check.h"
#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/sz.h"
#include "jemalloc/internal/tcache_inlines.h"
#include "jemalloc/internal/ticker.h"
static inline arena_t *
arena_get_from_edata(edata_t *edata) {
return (arena_t *)atomic_load_p(&arenas[edata_arena_ind_get(edata)],
ATOMIC_RELAXED);
return (arena_t *)atomic_load_p(
&arenas[edata_arena_ind_get(edata)], ATOMIC_RELAXED);
}
JEMALLOC_ALWAYS_INLINE arena_t *
@ -28,14 +37,48 @@ arena_choose_maybe_huge(tsd_t *tsd, arena_t *arena, size_t size) {
* 1) is using auto arena selection (i.e. arena == NULL), and 2) the
* thread is not assigned to a manual arena.
*/
if (unlikely(size >= oversize_threshold)) {
arena_t *tsd_arena = tsd_arena_get(tsd);
if (tsd_arena == NULL || arena_is_auto(tsd_arena)) {
return arena_choose_huge(tsd);
}
arena_t *tsd_arena = tsd_arena_get(tsd);
if (tsd_arena == NULL) {
tsd_arena = arena_choose(tsd, NULL);
}
return arena_choose(tsd, NULL);
size_t threshold = atomic_load_zu(
&tsd_arena->pa_shard.pac.oversize_threshold, ATOMIC_RELAXED);
if (unlikely(size >= threshold) && arena_is_auto(tsd_arena)) {
return arena_choose_huge(tsd);
}
return tsd_arena;
}
JEMALLOC_ALWAYS_INLINE bool
large_dalloc_safety_checks(edata_t *edata, const void *ptr, size_t input_size) {
if (!config_opt_safety_checks) {
return false;
}
/*
* Eagerly detect double free and sized dealloc bugs for large sizes.
* The cost is low enough (as edata will be accessed anyway) to be
* enabled all the time.
*/
if (unlikely(edata == NULL
|| edata_state_get(edata) != extent_state_active)) {
safety_check_fail(
"Invalid deallocation detected: "
"pages being freed (%p) not currently active, "
"possibly caused by double free bugs.",
ptr);
return true;
}
if (unlikely(input_size != edata_usize_get(edata)
|| input_size > SC_LARGE_MAXCLASS)) {
safety_check_fail_sized_dealloc(/* current_dealloc */ true, ptr,
/* true_size */ edata_usize_get(edata), input_size);
return true;
}
return false;
}
JEMALLOC_ALWAYS_INLINE void
@ -46,48 +89,56 @@ arena_prof_info_get(tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx,
assert(prof_info != NULL);
edata_t *edata = NULL;
bool is_slab;
bool is_slab;
/* Static check. */
if (alloc_ctx == NULL) {
edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global,
ptr);
edata = emap_edata_lookup(
tsd_tsdn(tsd), &arena_emap_global, ptr);
is_slab = edata_slab_get(edata);
} else if (unlikely(!(is_slab = alloc_ctx->slab))) {
edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global,
ptr);
edata = emap_edata_lookup(
tsd_tsdn(tsd), &arena_emap_global, ptr);
}
if (unlikely(!is_slab)) {
/* edata must have been initialized at this point. */
assert(edata != NULL);
size_t usize = (alloc_ctx == NULL)
? edata_usize_get(edata)
: emap_alloc_ctx_usize_get(alloc_ctx);
if (reset_recent
&& large_dalloc_safety_checks(edata, ptr, usize)) {
prof_info->alloc_tctx = PROF_TCTX_SENTINEL;
return;
}
large_prof_info_get(tsd, edata, prof_info, reset_recent);
} else {
prof_info->alloc_tctx = (prof_tctx_t *)(uintptr_t)1U;
prof_info->alloc_tctx = PROF_TCTX_SENTINEL;
/*
* No need to set other fields in prof_info; they will never be
* accessed if (uintptr_t)alloc_tctx == (uintptr_t)1U.
* accessed if alloc_tctx == PROF_TCTX_SENTINEL.
*/
}
}
JEMALLOC_ALWAYS_INLINE void
arena_prof_tctx_reset(tsd_t *tsd, const void *ptr,
emap_alloc_ctx_t *alloc_ctx) {
arena_prof_tctx_reset(
tsd_t *tsd, const void *ptr, emap_alloc_ctx_t *alloc_ctx) {
cassert(config_prof);
assert(ptr != NULL);
/* Static check. */
if (alloc_ctx == NULL) {
edata_t *edata = emap_edata_lookup(tsd_tsdn(tsd),
&arena_emap_global, ptr);
edata_t *edata = emap_edata_lookup(
tsd_tsdn(tsd), &arena_emap_global, ptr);
if (unlikely(!edata_slab_get(edata))) {
large_prof_tctx_reset(edata);
}
} else {
if (unlikely(!alloc_ctx->slab)) {
edata_t *edata = emap_edata_lookup(tsd_tsdn(tsd),
&arena_emap_global, ptr);
edata_t *edata = emap_edata_lookup(
tsd_tsdn(tsd), &arena_emap_global, ptr);
large_prof_tctx_reset(edata);
}
}
@ -98,16 +149,16 @@ arena_prof_tctx_reset_sampled(tsd_t *tsd, const void *ptr) {
cassert(config_prof);
assert(ptr != NULL);
edata_t *edata = emap_edata_lookup(tsd_tsdn(tsd), &arena_emap_global,
ptr);
edata_t *edata = emap_edata_lookup(
tsd_tsdn(tsd), &arena_emap_global, ptr);
assert(!edata_slab_get(edata));
large_prof_tctx_reset(edata);
}
JEMALLOC_ALWAYS_INLINE void
arena_prof_info_set(tsd_t *tsd, edata_t *edata, prof_tctx_t *tctx,
size_t size) {
arena_prof_info_set(
tsd_t *tsd, edata_t *edata, prof_tctx_t *tctx, size_t size) {
cassert(config_prof);
assert(!edata_slab_get(edata));
@ -130,8 +181,9 @@ arena_decay_ticks(tsdn_t *tsdn, arena_t *arena, unsigned nticks) {
* use a single ticker for all of them.
*/
ticker_geom_t *decay_ticker = tsd_arena_decay_tickerp_get(tsd);
uint64_t *prng_state = tsd_prng_statep_get(tsd);
if (unlikely(ticker_geom_ticks(decay_ticker, prng_state, nticks))) {
uint64_t *prng_state = tsd_prng_statep_get(tsd);
if (unlikely(ticker_geom_ticks(decay_ticker, prng_state, nticks,
tsd_reentrancy_level_get(tsd) > 0))) {
arena_decay(tsdn, arena, false, false);
}
}
@ -143,23 +195,24 @@ arena_decay_tick(tsdn_t *tsdn, arena_t *arena) {
JEMALLOC_ALWAYS_INLINE void *
arena_malloc(tsdn_t *tsdn, arena_t *arena, size_t size, szind_t ind, bool zero,
tcache_t *tcache, bool slow_path) {
bool slab, tcache_t *tcache, bool slow_path) {
assert(!tsdn_null(tsdn) || tcache == NULL);
if (likely(tcache != NULL)) {
if (likely(size <= SC_SMALL_MAXCLASS)) {
return tcache_alloc_small(tsdn_tsd(tsdn), arena,
tcache, size, ind, zero, slow_path);
if (likely(slab)) {
assert(sz_can_use_slab(size));
return tcache_alloc_small(tsdn_tsd(tsdn), arena, tcache,
size, ind, zero, slow_path);
} else if (likely(ind < tcache_nbins_get(tcache->tcache_slow)
&& !tcache_bin_disabled(ind, &tcache->bins[ind],
tcache->tcache_slow))) {
return tcache_alloc_large(tsdn_tsd(tsdn), arena, tcache,
size, ind, zero, slow_path);
}
if (likely(size <= tcache_maxclass)) {
return tcache_alloc_large(tsdn_tsd(tsdn), arena,
tcache, size, ind, zero, slow_path);
}
/* (size > tcache_maxclass) case falls through. */
assert(size > tcache_maxclass);
/* (size > tcache_max) case falls through. */
}
return arena_malloc_hard(tsdn, arena, size, ind, zero);
return arena_malloc_hard(tsdn, arena, size, ind, zero, slab);
}
JEMALLOC_ALWAYS_INLINE arena_t *
@ -176,7 +229,7 @@ arena_salloc(tsdn_t *tsdn, const void *ptr) {
emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx);
assert(alloc_ctx.szind != SC_NSIZES);
return sz_index2size(alloc_ctx.szind);
return emap_alloc_ctx_usize_get(&alloc_ctx);
}
JEMALLOC_ALWAYS_INLINE size_t
@ -191,8 +244,8 @@ arena_vsalloc(tsdn_t *tsdn, const void *ptr) {
*/
emap_full_alloc_ctx_t full_alloc_ctx;
bool missing = emap_full_alloc_ctx_try_lookup(tsdn, &arena_emap_global,
ptr, &full_alloc_ctx);
bool missing = emap_full_alloc_ctx_try_lookup(
tsdn, &arena_emap_global, ptr, &full_alloc_ctx);
if (missing) {
return 0;
}
@ -207,46 +260,24 @@ arena_vsalloc(tsdn_t *tsdn, const void *ptr) {
assert(full_alloc_ctx.szind != SC_NSIZES);
return sz_index2size(full_alloc_ctx.szind);
}
JEMALLOC_ALWAYS_INLINE bool
large_dalloc_safety_checks(edata_t *edata, void *ptr, szind_t szind) {
if (!config_opt_safety_checks) {
return false;
}
/*
* Eagerly detect double free and sized dealloc bugs for large sizes.
* The cost is low enough (as edata will be accessed anyway) to be
* enabled all the time.
*/
if (unlikely(edata == NULL ||
edata_state_get(edata) != extent_state_active)) {
safety_check_fail("Invalid deallocation detected: "
"pages being freed (%p) not currently active, "
"possibly caused by double free bugs.",
(uintptr_t)edata_addr_get(edata));
return true;
}
size_t input_size = sz_index2size(szind);
if (unlikely(input_size != edata_usize_get(edata))) {
safety_check_fail_sized_dealloc(/* current_dealloc */ true, ptr,
/* true_size */ edata_usize_get(edata), input_size);
return true;
}
return false;
return edata_usize_get(full_alloc_ctx.edata);
}
static inline void
arena_dalloc_large_no_tcache(tsdn_t *tsdn, void *ptr, szind_t szind) {
arena_dalloc_large_no_tcache(
tsdn_t *tsdn, void *ptr, szind_t szind, size_t usize) {
/*
* szind is still needed in this function mainly because
* szind < SC_NBINS determines not only if this is a small alloc,
* but also if szind is valid (an inactive extent would have
* szind == SC_NSIZES).
*/
if (config_prof && unlikely(szind < SC_NBINS)) {
arena_dalloc_promoted(tsdn, ptr, NULL, true);
} else {
edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
ptr);
if (large_dalloc_safety_checks(edata, ptr, szind)) {
edata_t *edata = emap_edata_lookup(
tsdn, &arena_emap_global, ptr);
if (large_dalloc_safety_checks(edata, ptr, usize)) {
/* See the comment in isfree. */
return;
}
@ -262,42 +293,76 @@ arena_dalloc_no_tcache(tsdn_t *tsdn, void *ptr) {
emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr, &alloc_ctx);
if (config_debug) {
edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
ptr);
edata_t *edata = emap_edata_lookup(
tsdn, &arena_emap_global, ptr);
assert(alloc_ctx.szind == edata_szind_get(edata));
assert(alloc_ctx.szind < SC_NSIZES);
assert(alloc_ctx.slab == edata_slab_get(edata));
assert(emap_alloc_ctx_usize_get(&alloc_ctx)
== edata_usize_get(edata));
}
if (likely(alloc_ctx.slab)) {
/* Small allocation. */
arena_dalloc_small(tsdn, ptr);
} else {
arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind);
arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind,
emap_alloc_ctx_usize_get(&alloc_ctx));
}
}
JEMALLOC_ALWAYS_INLINE void
arena_dalloc_large(tsdn_t *tsdn, void *ptr, tcache_t *tcache, szind_t szind,
bool slow_path) {
if (szind < nhbins) {
if (config_prof && unlikely(szind < SC_NBINS)) {
arena_dalloc_promoted(tsdn, ptr, tcache, slow_path);
} else {
tcache_dalloc_large(tsdn_tsd(tsdn), tcache, ptr, szind,
slow_path);
}
size_t usize, bool slow_path) {
assert(!tsdn_null(tsdn) && tcache != NULL);
bool is_sample_promoted = config_prof && szind < SC_NBINS;
if (unlikely(is_sample_promoted)) {
arena_dalloc_promoted(tsdn, ptr, tcache, slow_path);
} else {
edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
ptr);
if (large_dalloc_safety_checks(edata, ptr, szind)) {
/* See the comment in isfree. */
return;
if (szind < tcache_nbins_get(tcache->tcache_slow)
&& !tcache_bin_disabled(
szind, &tcache->bins[szind], tcache->tcache_slow)) {
tcache_dalloc_large(
tsdn_tsd(tsdn), tcache, ptr, szind, slow_path);
} else {
edata_t *edata = emap_edata_lookup(
tsdn, &arena_emap_global, ptr);
if (large_dalloc_safety_checks(edata, ptr, usize)) {
/* See the comment in isfree. */
return;
}
large_dalloc(tsdn, edata);
}
large_dalloc(tsdn, edata);
}
}
JEMALLOC_ALWAYS_INLINE bool
arena_tcache_dalloc_small_safety_check(tsdn_t *tsdn, void *ptr) {
if (!config_debug) {
return false;
}
edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global, ptr);
szind_t binind = edata_szind_get(edata);
div_info_t div_info = arena_binind_div_info[binind];
/*
* Calls the internal function bin_slab_regind_impl because the
* safety check does not require a lock.
*/
size_t regind = bin_slab_regind_impl(&div_info, binind, edata, ptr);
slab_data_t *slab_data = edata_slab_data_get(edata);
const bin_info_t *bin_info = &bin_infos[binind];
assert(edata_nfree_get(edata) < bin_info->nregs);
if (unlikely(!bitmap_get(
slab_data->bitmap, &bin_info->bitmap_info, regind))) {
safety_check_fail(
"Invalid deallocation detected: the pointer being freed (%p) not "
"currently active, possibly caused by double free bugs.\n",
ptr);
return true;
}
return false;
}
JEMALLOC_ALWAYS_INLINE void
arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
emap_alloc_ctx_t *caller_alloc_ctx, bool slow_path) {
@ -313,26 +378,31 @@ arena_dalloc(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
if (caller_alloc_ctx != NULL) {
alloc_ctx = *caller_alloc_ctx;
} else {
util_assume(!tsdn_null(tsdn));
emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr,
&alloc_ctx);
util_assume(tsdn != NULL);
emap_alloc_ctx_lookup(
tsdn, &arena_emap_global, ptr, &alloc_ctx);
}
if (config_debug) {
edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
ptr);
edata_t *edata = emap_edata_lookup(
tsdn, &arena_emap_global, ptr);
assert(alloc_ctx.szind == edata_szind_get(edata));
assert(alloc_ctx.szind < SC_NSIZES);
assert(alloc_ctx.slab == edata_slab_get(edata));
assert(emap_alloc_ctx_usize_get(&alloc_ctx)
== edata_usize_get(edata));
}
if (likely(alloc_ctx.slab)) {
/* Small allocation. */
tcache_dalloc_small(tsdn_tsd(tsdn), tcache, ptr,
alloc_ctx.szind, slow_path);
if (arena_tcache_dalloc_small_safety_check(tsdn, ptr)) {
return;
}
tcache_dalloc_small(
tsdn_tsd(tsdn), tcache, ptr, alloc_ctx.szind, slow_path);
} else {
arena_dalloc_large(tsdn, ptr, tcache, alloc_ctx.szind,
slow_path);
emap_alloc_ctx_usize_get(&alloc_ctx), slow_path);
}
}
@ -347,21 +417,22 @@ arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) {
* There is no risk of being confused by a promoted sampled
* object, so base szind and slab on the given size.
*/
alloc_ctx.szind = sz_size2index(size);
alloc_ctx.slab = (alloc_ctx.szind < SC_NBINS);
szind_t szind = sz_size2index(size);
emap_alloc_ctx_init(
&alloc_ctx, szind, (szind < SC_NBINS), size);
}
if ((config_prof && opt_prof) || config_debug) {
emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr,
&alloc_ctx);
emap_alloc_ctx_lookup(
tsdn, &arena_emap_global, ptr, &alloc_ctx);
assert(alloc_ctx.szind == sz_size2index(size));
assert((config_prof && opt_prof)
|| alloc_ctx.slab == (alloc_ctx.szind < SC_NBINS));
if (config_debug) {
edata_t *edata = emap_edata_lookup(tsdn,
&arena_emap_global, ptr);
edata_t *edata = emap_edata_lookup(
tsdn, &arena_emap_global, ptr);
assert(alloc_ctx.szind == edata_szind_get(edata));
assert(alloc_ctx.slab == edata_slab_get(edata));
}
@ -371,7 +442,8 @@ arena_sdalloc_no_tcache(tsdn_t *tsdn, void *ptr, size_t size) {
/* Small allocation. */
arena_dalloc_small(tsdn, ptr);
} else {
arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind);
arena_dalloc_large_no_tcache(tsdn, ptr, alloc_ctx.szind,
emap_alloc_ctx_usize_get(&alloc_ctx));
}
}
@ -391,9 +463,10 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
if (config_prof && opt_prof) {
if (caller_alloc_ctx == NULL) {
/* Uncommon case and should be a static check. */
emap_alloc_ctx_lookup(tsdn, &arena_emap_global, ptr,
&alloc_ctx);
emap_alloc_ctx_lookup(
tsdn, &arena_emap_global, ptr, &alloc_ctx);
assert(alloc_ctx.szind == sz_size2index(size));
assert(emap_alloc_ctx_usize_get(&alloc_ctx) == size);
} else {
alloc_ctx = *caller_alloc_ctx;
}
@@ -407,30 +480,37 @@ arena_sdalloc(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
}
if (config_debug) {
edata_t *edata = emap_edata_lookup(tsdn, &arena_emap_global,
ptr);
edata_t *edata = emap_edata_lookup(
tsdn, &arena_emap_global, ptr);
assert(alloc_ctx.szind == edata_szind_get(edata));
assert(alloc_ctx.slab == edata_slab_get(edata));
emap_alloc_ctx_init(
&alloc_ctx, alloc_ctx.szind, alloc_ctx.slab, sz_s2u(size));
assert(emap_alloc_ctx_usize_get(&alloc_ctx)
== edata_usize_get(edata));
}
if (likely(alloc_ctx.slab)) {
/* Small allocation. */
tcache_dalloc_small(tsdn_tsd(tsdn), tcache, ptr,
alloc_ctx.szind, slow_path);
if (arena_tcache_dalloc_small_safety_check(tsdn, ptr)) {
return;
}
tcache_dalloc_small(
tsdn_tsd(tsdn), tcache, ptr, alloc_ctx.szind, slow_path);
} else {
arena_dalloc_large(tsdn, ptr, tcache, alloc_ctx.szind,
slow_path);
sz_s2u(size), slow_path);
}
}
static inline void
arena_cache_oblivious_randomize(tsdn_t *tsdn, arena_t *arena, edata_t *edata,
size_t alignment) {
arena_cache_oblivious_randomize(
tsdn_t *tsdn, arena_t *arena, edata_t *edata, size_t alignment) {
assert(edata_base_get(edata) == edata_addr_get(edata));
if (alignment < PAGE) {
unsigned lg_range = LG_PAGE -
lg_floor(CACHELINE_CEILING(alignment));
unsigned lg_range = LG_PAGE
- lg_floor(CACHELINE_CEILING(alignment));
size_t r;
if (!tsdn_null(tsdn)) {
tsd_t *tsd = tsdn_tsd(tsdn);
@@ -440,110 +520,18 @@ arena_cache_oblivious_randomize(tsdn_t *tsdn, arena_t *arena, edata_t *edata,
uint64_t stack_value = (uint64_t)(uintptr_t)&r;
r = (size_t)prng_lg_range_u64(&stack_value, lg_range);
}
uintptr_t random_offset = ((uintptr_t)r) << (LG_PAGE -
lg_range);
edata->e_addr = (void *)((uintptr_t)edata->e_addr +
random_offset);
assert(ALIGNMENT_ADDR2BASE(edata->e_addr, alignment) ==
edata->e_addr);
}
}
/*
* The dalloc bin info contains just the information that the common paths need
* during tcache flushes. By force-inlining these paths, and using local copies
* of data (so that the compiler knows it's constant), we avoid a whole bunch of
* redundant loads and stores by leaving this information in registers.
*/
typedef struct arena_dalloc_bin_locked_info_s arena_dalloc_bin_locked_info_t;
struct arena_dalloc_bin_locked_info_s {
div_info_t div_info;
uint32_t nregs;
uint64_t ndalloc;
};
JEMALLOC_ALWAYS_INLINE size_t
arena_slab_regind(arena_dalloc_bin_locked_info_t *info, szind_t binind,
edata_t *slab, const void *ptr) {
size_t diff, regind;
/* Freeing a pointer outside the slab can cause assertion failure. */
assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab));
assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab));
/* Freeing an interior pointer can cause assertion failure. */
assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab)) %
(uintptr_t)bin_infos[binind].reg_size == 0);
diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab));
/* Avoid doing division with a variable divisor. */
regind = div_compute(&info->div_info, diff);
assert(regind < bin_infos[binind].nregs);
return regind;
}
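arena_slab_regind above avoids a hardware divide by routing the byte offset through div_compute, which multiplies by a precomputed per-bin reciprocal. A hedged sketch of that trick (field names are illustrative, not jemalloc's actual div_info_t layout):

```c
#include <assert.h>
#include <stdint.h>

/*
 * Hypothetical precomputed-divisor state. With magic = ceil(2^32 / divisor)
 * and divisor > 1, (n * magic) >> 32 equals n / divisor exactly whenever n is
 * a multiple of divisor and n < 2^32 -- which matches the slab case, since
 * offsets are always multiples of reg_size.
 */
typedef struct {
	uint32_t magic;
} fast_div_t;

static void
fast_div_init(fast_div_t *d, uint32_t divisor) {
	d->magic = (uint32_t)((((uint64_t)1 << 32) + divisor - 1) / divisor);
}

static uint32_t
fast_div(const fast_div_t *d, uint32_t n) {
	/* One multiply and a shift; no divide instruction on the hot path. */
	return (uint32_t)(((uint64_t)n * d->magic) >> 32);
}
```

Initializing once per bin and dividing many times per flush is what makes the precomputation pay off.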
JEMALLOC_ALWAYS_INLINE void
arena_dalloc_bin_locked_begin(arena_dalloc_bin_locked_info_t *info,
szind_t binind) {
info->div_info = arena_binind_div_info[binind];
info->nregs = bin_infos[binind].nregs;
info->ndalloc = 0;
}
/*
* Does the deallocation work associated with freeing a single pointer (a
 * "step") in between an arena_dalloc_bin_locked begin and end call.
*
* Returns true if arena_slab_dalloc must be called on slab. Doesn't do
* stats updates, which happen during finish (this lets running counts get left
* in a register).
*/
JEMALLOC_ALWAYS_INLINE bool
arena_dalloc_bin_locked_step(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
arena_dalloc_bin_locked_info_t *info, szind_t binind, edata_t *slab,
void *ptr) {
const bin_info_t *bin_info = &bin_infos[binind];
size_t regind = arena_slab_regind(info, binind, slab, ptr);
slab_data_t *slab_data = edata_slab_data_get(slab);
assert(edata_nfree_get(slab) < bin_info->nregs);
/* Freeing an unallocated pointer can cause assertion failure. */
assert(bitmap_get(slab_data->bitmap, &bin_info->bitmap_info, regind));
bitmap_unset(slab_data->bitmap, &bin_info->bitmap_info, regind);
edata_nfree_inc(slab);
if (config_stats) {
info->ndalloc++;
}
unsigned nfree = edata_nfree_get(slab);
if (nfree == bin_info->nregs) {
arena_dalloc_bin_locked_handle_newly_empty(tsdn, arena, slab,
bin);
return true;
} else if (nfree == 1 && slab != bin->slabcur) {
arena_dalloc_bin_locked_handle_newly_nonempty(tsdn, arena, slab,
bin);
}
return false;
}
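The comment above describes the begin/step/finish contract: per-flush counters live in a local struct so shared bin stats are touched once per batch rather than once per freed pointer. A stripped-down sketch of that pattern (types here are illustrative stand-ins; locking is elided):

```c
#include <assert.h>
#include <stdint.h>

/* Shared, normally mutex-protected stats (locking elided in this sketch). */
typedef struct {
	uint64_t ndalloc;
} bin_stats_t;

/* Per-batch scratch that can stay in locals/registers during a flush. */
typedef struct {
	uint64_t ndalloc;
} batch_info_t;

static void
batch_begin(batch_info_t *info) {
	info->ndalloc = 0;
}

static void
batch_step(batch_info_t *info) {
	/* The running count never touches shared memory here. */
	info->ndalloc++;
}

static void
batch_finish(bin_stats_t *stats, batch_info_t *info) {
	/* A single merge into the shared counters per batch. */
	stats->ndalloc += info->ndalloc;
}
```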
JEMALLOC_ALWAYS_INLINE void
arena_dalloc_bin_locked_finish(tsdn_t *tsdn, arena_t *arena, bin_t *bin,
arena_dalloc_bin_locked_info_t *info) {
if (config_stats) {
bin->stats.ndalloc += info->ndalloc;
assert(bin->stats.curregs >= (size_t)info->ndalloc);
bin->stats.curregs -= (size_t)info->ndalloc;
uintptr_t random_offset = ((uintptr_t)r)
<< (LG_PAGE - lg_range);
edata->e_addr = (void *)((byte_t *)edata->e_addr
+ random_offset);
assert(ALIGNMENT_ADDR2BASE(edata->e_addr, alignment)
== edata->e_addr);
}
}
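The randomization above draws r uniformly in [0, 2^lg_range) and scales it so the resulting offset preserves the requested alignment while landing anywhere in the page. A small standalone sketch of the offset arithmetic, assuming a 4 KiB page and 64-byte cachelines (the clamp to CACHELINE here stands in for CACHELINE_CEILING):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

enum { LG_PAGE = 12, CACHELINE = 64 };

static unsigned
lg_floor(size_t x) {
	unsigned lg = 0;
	while ((x >>= 1) != 0) {
		lg++;
	}
	return lg;
}

/*
 * Map a raw random value r onto an alignment-preserving offset inside one
 * page: r is truncated to lg_range bits, then scaled by the alignment.
 */
static uintptr_t
random_page_offset(uint64_t r, size_t alignment) {
	size_t a = alignment < CACHELINE ? CACHELINE : alignment;
	unsigned lg_range = LG_PAGE - lg_floor(a);
	r &= (((uint64_t)1 << lg_range) - 1);
	return (uintptr_t)r << (LG_PAGE - lg_range);
}
```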
static inline bin_t *
arena_get_bin(arena_t *arena, szind_t binind, unsigned binshard) {
bin_t *shard0 = (bin_t *)((uintptr_t)arena + arena_bin_offsets[binind]);
bin_t *shard0 = (bin_t *)((byte_t *)arena + arena_bin_offsets[binind]);
return shard0 + binshard;
}


@@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_ARENA_STATS_H
#define JEMALLOC_INTERNAL_ARENA_STATS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/lockedint.h"
#include "jemalloc/internal/mutex.h"
@@ -13,28 +14,34 @@ JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
typedef struct arena_stats_large_s arena_stats_large_t;
struct arena_stats_large_s {
/*
* Total number of allocation/deallocation requests served directly by
* the arena.
* Total number of large allocation/deallocation requests served directly
* by the arena.
*/
locked_u64_t nmalloc;
locked_u64_t ndalloc;
locked_u64_t nmalloc;
locked_u64_t ndalloc;
/*
* Total large active bytes (allocated - deallocated) served directly
* by the arena.
*/
locked_u64_t active_bytes;
/*
* Number of allocation requests that correspond to this size class.
* This includes requests served by tcache, though tcache only
* periodically merges into this counter.
*/
locked_u64_t nrequests; /* Partially derived. */
locked_u64_t nrequests; /* Partially derived. */
/*
* Number of tcache fills / flushes for large (similarly, periodically
* merged). Note that there is no large tcache batch-fill currently
* (i.e. only fill 1 at a time); however flush may be batched.
*/
locked_u64_t nfills; /* Partially derived. */
locked_u64_t nflushes; /* Partially derived. */
locked_u64_t nfills; /* Partially derived. */
locked_u64_t nflushes; /* Partially derived. */
/* Current number of allocations of this size class. */
size_t curlextents; /* Derived. */
size_t curlextents; /* Derived. */
};
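curlextents is marked Derived because it is reconstructed from the monotonic counters rather than maintained directly: current extents are allocations minus deallocations. A trivial sketch of that identity (the snapshot type is an illustrative stand-in for the locked_u64_t counters above):

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

/* Simplified snapshot of the monotonic per-size-class counters. */
typedef struct {
	uint64_t nmalloc; /* total large allocations served */
	uint64_t ndalloc; /* total large deallocations served */
} large_stats_snapshot_t;

/* Current extant allocations = allocated - deallocated. */
static size_t
curlextents_compute(const large_stats_snapshot_t *s) {
	assert(s->nmalloc >= s->ndalloc);
	return (size_t)(s->nmalloc - s->ndalloc);
}
```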
/*
@@ -50,38 +57,40 @@ struct arena_stats_s {
* resident includes the base stats -- that's why it lives here and not
* in pa_shard_stats_t.
*/
size_t base; /* Derived. */
size_t resident; /* Derived. */
size_t metadata_thp; /* Derived. */
size_t mapped; /* Derived. */
size_t base; /* Derived. */
size_t metadata_edata; /* Derived. */
size_t metadata_rtree; /* Derived. */
size_t resident; /* Derived. */
size_t metadata_thp; /* Derived. */
size_t mapped; /* Derived. */
atomic_zu_t internal;
atomic_zu_t internal;
size_t allocated_large; /* Derived. */
uint64_t nmalloc_large; /* Derived. */
uint64_t ndalloc_large; /* Derived. */
uint64_t nfills_large; /* Derived. */
uint64_t nflushes_large; /* Derived. */
uint64_t nrequests_large; /* Derived. */
size_t allocated_large; /* Derived. */
uint64_t nmalloc_large; /* Derived. */
uint64_t ndalloc_large; /* Derived. */
uint64_t nfills_large; /* Derived. */
uint64_t nflushes_large; /* Derived. */
uint64_t nrequests_large; /* Derived. */
/*
* The stats logically owned by the pa_shard in the same arena. This
* lives here only because it's convenient for the purposes of the ctl
* module -- it only knows about the single arena_stats.
*/
pa_shard_stats_t pa_shard_stats;
pa_shard_stats_t pa_shard_stats;
/* Number of bytes cached in tcache associated with this arena. */
size_t tcache_bytes; /* Derived. */
size_t tcache_stashed_bytes; /* Derived. */
size_t tcache_bytes; /* Derived. */
size_t tcache_stashed_bytes; /* Derived. */
mutex_prof_data_t mutex_prof_data[mutex_prof_num_arena_mutexes];
/* One element for each large size class. */
arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
/* Arena uptime. */
nstime_t uptime;
nstime_t uptime;
};
static inline bool
@@ -92,7 +101,7 @@ arena_stats_init(tsdn_t *tsdn, arena_stats_t *arena_stats) {
}
}
if (LOCKEDINT_MTX_INIT(arena_stats->mtx, "arena_stats",
WITNESS_RANK_ARENA_STATS, malloc_mutex_rank_exclusive)) {
WITNESS_RANK_ARENA_STATS, malloc_mutex_rank_exclusive)) {
return true;
}
/* Memory is zeroed, so there is no need to clear stats. */
@@ -106,8 +115,8 @@ arena_stats_large_flush_nrequests_add(tsdn_t *tsdn, arena_stats_t *arena_stats,
arena_stats_large_t *lstats = &arena_stats->lstats[szind - SC_NBINS];
locked_inc_u64(tsdn, LOCKEDINT_MTX(arena_stats->mtx),
&lstats->nrequests, nrequests);
locked_inc_u64(tsdn, LOCKEDINT_MTX(arena_stats->mtx),
&lstats->nflushes, 1);
locked_inc_u64(
tsdn, LOCKEDINT_MTX(arena_stats->mtx), &lstats->nflushes, 1);
LOCKEDINT_MTX_UNLOCK(tsdn, arena_stats->mtx);
}


@@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_ARENA_STRUCTS_H
#define JEMALLOC_INTERNAL_ARENA_STRUCTS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_stats.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/bin.h"
@@ -31,20 +32,20 @@ struct arena_s {
*
* Synchronization: atomic.
*/
atomic_u_t nthreads[2];
atomic_u_t nthreads[2];
/* Next bin shard for binding new threads. Synchronization: atomic. */
atomic_u_t binshard_next;
atomic_u_t binshard_next;
/*
* When percpu_arena is enabled, to amortize the cost of reading /
* updating the current CPU id, track the most recent thread accessing
* this arena, and only read CPU if there is a mismatch.
*/
tsdn_t *last_thd;
tsdn_t *last_thd;
/* Synchronization: internal. */
arena_stats_t stats;
arena_stats_t stats;
/*
* Lists of tcaches and cache_bin_array_descriptors for extant threads
@@ -53,28 +54,28 @@ struct arena_s {
*
* Synchronization: tcache_ql_mtx.
*/
ql_head(tcache_slow_t) tcache_ql;
ql_head(cache_bin_array_descriptor_t) cache_bin_array_descriptor_ql;
malloc_mutex_t tcache_ql_mtx;
ql_head(tcache_slow_t) tcache_ql;
ql_head(cache_bin_array_descriptor_t) cache_bin_array_descriptor_ql;
malloc_mutex_t tcache_ql_mtx;
/*
* Represents a dss_prec_t, but atomically.
*
* Synchronization: atomic.
*/
atomic_u_t dss_prec;
atomic_u_t dss_prec;
/*
* Extant large allocations.
*
* Synchronization: large_mtx.
*/
edata_list_active_t large;
edata_list_active_t large;
/* Synchronizes all large allocation/update/deallocation. */
malloc_mutex_t large_mtx;
malloc_mutex_t large_mtx;
/* The page-level allocator shard this arena uses. */
pa_shard_t pa_shard;
pa_shard_t pa_shard;
/*
* A cached copy of base->ind. This can get accessed on hot paths;
@@ -87,15 +88,24 @@ struct arena_s {
*
* Synchronization: internal.
*/
base_t *base;
base_t *base;
/* Used to determine uptime. Read-only after initialization. */
nstime_t create_time;
nstime_t create_time;
/* The name of the arena. */
char name[ARENA_NAME_LEN];
/*
* The arena is allocated alongside its bins; really this is a
* dynamically sized array determined by the binshard settings.
* Enforcing cacheline-alignment to minimize the number of cachelines
* touched on the hot paths.
*/
bin_t bins[0];
JEMALLOC_WARN_ON_USAGE(
"Do not use this field directly. "
"Use `arena_get_bin` instead.")
JEMALLOC_ALIGNED(CACHELINE)
bin_t all_bins[0];
};
#endif /* JEMALLOC_INTERNAL_ARENA_STRUCTS_H */
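The trailing all_bins[0] array means an arena is allocated in one block together with a variable number of bin shards, and arena_get_bin locates a shard via a precomputed byte offset. A self-contained sketch of that layout using a C99 flexible array member (the shard counts and names here are invented for illustration):

```c
#include <assert.h>
#include <stddef.h>
#include <stdlib.h>

/* Illustrative per-bin shard counts; jemalloc derives these from settings. */
enum { NBINS = 3 };
static const unsigned nshards[NBINS] = {2, 1, 4};

typedef struct {
	unsigned long nmalloc;
} bin_t;

typedef struct {
	unsigned ind;
	/* Dynamically sized trailing array, as with arena_s's all_bins[0]. */
	bin_t all_bins[];
} arena_t;

/* Byte offset of the first shard of each bin, precomputed once. */
static size_t bin_offsets[NBINS];

static arena_t *
arena_create(void) {
	size_t total = 0;
	for (unsigned i = 0; i < NBINS; i++) {
		bin_offsets[i] = offsetof(arena_t, all_bins)
		    + total * sizeof(bin_t);
		total += nshards[i];
	}
	/* One allocation holds the arena header and every bin shard. */
	return calloc(1, offsetof(arena_t, all_bins) + total * sizeof(bin_t));
}

static bin_t *
arena_get_bin(arena_t *a, unsigned binind, unsigned binshard) {
	bin_t *shard0 = (bin_t *)((char *)a + bin_offsets[binind]);
	return shard0 + binshard;
}
```

Accessing shards only through the offset table is what the warning on the real field is enforcing.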


@@ -1,39 +1,41 @@
#ifndef JEMALLOC_INTERNAL_ARENA_TYPES_H
#define JEMALLOC_INTERNAL_ARENA_TYPES_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/sc.h"
/* Default decay times in milliseconds. */
#define DIRTY_DECAY_MS_DEFAULT ZD(10 * 1000)
#define MUZZY_DECAY_MS_DEFAULT (0)
#define DIRTY_DECAY_MS_DEFAULT ZD(10 * 1000)
#define MUZZY_DECAY_MS_DEFAULT (0)
/* Number of event ticks between time checks. */
#define ARENA_DECAY_NTICKS_PER_UPDATE 1000
#define ARENA_DECAY_NTICKS_PER_UPDATE 1000
/* Maximum length of the arena name. */
#define ARENA_NAME_LEN 32
typedef struct arena_decay_s arena_decay_t;
typedef struct arena_s arena_t;
typedef enum {
percpu_arena_mode_names_base = 0, /* Used for options processing. */
percpu_arena_mode_names_base = 0, /* Used for options processing. */
/*
* *_uninit are used only during bootstrapping, and must correspond
* to initialized variant plus percpu_arena_mode_enabled_base.
*/
percpu_arena_uninit = 0,
per_phycpu_arena_uninit = 1,
percpu_arena_uninit = 0,
per_phycpu_arena_uninit = 1,
/* All non-disabled modes must come after percpu_arena_disabled. */
percpu_arena_disabled = 2,
percpu_arena_disabled = 2,
percpu_arena_mode_names_limit = 3, /* Used for options processing. */
percpu_arena_mode_names_limit = 3, /* Used for options processing. */
percpu_arena_mode_enabled_base = 3,
percpu_arena = 3,
per_phycpu_arena = 4 /* Hyper threads share arena. */
percpu_arena = 3,
per_phycpu_arena = 4 /* Hyper threads share arena. */
} percpu_arena_mode_t;
#define PERCPU_ARENA_ENABLED(m) ((m) >= percpu_arena_mode_enabled_base)
#define PERCPU_ARENA_DEFAULT percpu_arena_disabled
#define PERCPU_ARENA_ENABLED(m) ((m) >= percpu_arena_mode_enabled_base)
#define PERCPU_ARENA_DEFAULT percpu_arena_disabled
/*
* When allocation_size >= oversize_threshold, use the dedicated huge arena


@@ -1,3 +1,4 @@
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/malloc_io.h"
#include "jemalloc/internal/util.h"
@@ -6,51 +7,57 @@
* assertion failure.
*/
#ifndef assert
#define assert(e) do { \
if (unlikely(config_debug && !(e))) { \
malloc_printf( \
"<jemalloc>: %s:%d: Failed assertion: \"%s\"\n", \
__FILE__, __LINE__, #e); \
abort(); \
} \
} while (0)
# define assert(e) \
do { \
if (unlikely(config_debug && !(e))) { \
malloc_printf( \
"<jemalloc>: %s:%d: Failed assertion: \"%s\"\n", \
__FILE__, __LINE__, #e); \
abort(); \
} \
} while (0)
#endif
#ifndef not_reached
#define not_reached() do { \
if (config_debug) { \
malloc_printf( \
"<jemalloc>: %s:%d: Unreachable code reached\n", \
__FILE__, __LINE__); \
abort(); \
} \
unreachable(); \
} while (0)
# define not_reached() \
do { \
if (config_debug) { \
malloc_printf( \
"<jemalloc>: %s:%d: Unreachable code reached\n", \
__FILE__, __LINE__); \
abort(); \
} \
unreachable(); \
} while (0)
#endif
#ifndef not_implemented
#define not_implemented() do { \
if (config_debug) { \
malloc_printf("<jemalloc>: %s:%d: Not implemented\n", \
__FILE__, __LINE__); \
abort(); \
} \
} while (0)
# define not_implemented() \
do { \
if (config_debug) { \
malloc_printf( \
"<jemalloc>: %s:%d: Not implemented\n", \
__FILE__, __LINE__); \
abort(); \
} \
} while (0)
#endif
#ifndef assert_not_implemented
#define assert_not_implemented(e) do { \
if (unlikely(config_debug && !(e))) { \
not_implemented(); \
} \
} while (0)
# define assert_not_implemented(e) \
do { \
if (unlikely(config_debug && !(e))) { \
not_implemented(); \
} \
} while (0)
#endif
/* Use to assert a particular configuration, e.g., cassert(config_debug). */
#ifndef cassert
#define cassert(c) do { \
if (unlikely(!(c))) { \
not_reached(); \
} \
} while (0)
# define cassert(c) \
do { \
if (unlikely(!(c))) { \
not_reached(); \
} \
} while (0)
#endif
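Because config_debug is a compile-time constant, the `config_debug && !(e)` test lets the compiler fold the whole assertion away in release builds, with no separate NDEBUG-style switch. A minimal sketch of the same pattern (the constant is hard-coded here; jemalloc gets it from configure):

```c
#include <stdio.h>
#include <stdlib.h>

/* In jemalloc this constant comes from configure; hard-coded here. */
static const int config_debug = 1;

/*
 * When config_debug is 0, the condition is constant-false and the compiler
 * eliminates the branch and the format string entirely.
 */
#define my_assert(e)							\
	do {								\
		if (config_debug && !(e)) {				\
			fprintf(stderr,					\
			    "%s:%d: Failed assertion: \"%s\"\n",	\
			    __FILE__, __LINE__, #e);			\
			abort();					\
		}							\
	} while (0)
```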


@@ -1,27 +1,29 @@
#ifndef JEMALLOC_INTERNAL_ATOMIC_H
#define JEMALLOC_INTERNAL_ATOMIC_H
#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
#include "jemalloc/internal/jemalloc_preamble.h"
#define JEMALLOC_U8_ATOMICS
#if defined(JEMALLOC_GCC_ATOMIC_ATOMICS)
# include "jemalloc/internal/atomic_gcc_atomic.h"
# if !defined(JEMALLOC_GCC_U8_ATOMIC_ATOMICS)
# undef JEMALLOC_U8_ATOMICS
# endif
# include "jemalloc/internal/atomic_gcc_atomic.h"
# if !defined(JEMALLOC_GCC_U8_ATOMIC_ATOMICS)
# undef JEMALLOC_U8_ATOMICS
# endif
#elif defined(JEMALLOC_GCC_SYNC_ATOMICS)
# include "jemalloc/internal/atomic_gcc_sync.h"
# if !defined(JEMALLOC_GCC_U8_SYNC_ATOMICS)
# undef JEMALLOC_U8_ATOMICS
# endif
# include "jemalloc/internal/atomic_gcc_sync.h"
# if !defined(JEMALLOC_GCC_U8_SYNC_ATOMICS)
# undef JEMALLOC_U8_ATOMICS
# endif
#elif defined(_MSC_VER)
# include "jemalloc/internal/atomic_msvc.h"
# include "jemalloc/internal/atomic_msvc.h"
#elif defined(JEMALLOC_C11_ATOMICS)
# include "jemalloc/internal/atomic_c11.h"
# include "jemalloc/internal/atomic_c11.h"
#else
# error "Don't have atomics implemented on this platform."
# error "Don't have atomics implemented on this platform."
#endif
#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
/*
* This header gives more or less a backport of C11 atomics. The user can write
* JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_sizeof_type); to generate
@@ -54,22 +56,19 @@
/*
* Another convenience -- simple atomic helper functions.
*/
#define JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(type, short_type, \
lg_size) \
JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size) \
ATOMIC_INLINE void \
atomic_load_add_store_##short_type(atomic_##short_type##_t *a, \
type inc) { \
type oldval = atomic_load_##short_type(a, ATOMIC_RELAXED); \
type newval = oldval + inc; \
atomic_store_##short_type(a, newval, ATOMIC_RELAXED); \
} \
ATOMIC_INLINE void \
atomic_load_sub_store_##short_type(atomic_##short_type##_t *a, \
type inc) { \
type oldval = atomic_load_##short_type(a, ATOMIC_RELAXED); \
type newval = oldval - inc; \
atomic_store_##short_type(a, newval, ATOMIC_RELAXED); \
#define JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(type, short_type, lg_size) \
JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size) \
ATOMIC_INLINE void atomic_load_add_store_##short_type( \
atomic_##short_type##_t *a, type inc) { \
type oldval = atomic_load_##short_type(a, ATOMIC_RELAXED); \
type newval = oldval + inc; \
atomic_store_##short_type(a, newval, ATOMIC_RELAXED); \
} \
ATOMIC_INLINE void atomic_load_sub_store_##short_type( \
atomic_##short_type##_t *a, type inc) { \
type oldval = atomic_load_##short_type(a, ATOMIC_RELAXED); \
type newval = oldval - inc; \
atomic_store_##short_type(a, newval, ATOMIC_RELAXED); \
}
/*
@@ -77,7 +76,7 @@
* fact.
*/
#if (LG_SIZEOF_PTR == 3 || LG_SIZEOF_INT == 3)
# define JEMALLOC_ATOMIC_U64
# define JEMALLOC_ATOMIC_U64
#endif
JEMALLOC_GENERATE_ATOMICS(void *, p, LG_SIZEOF_PTR)
@@ -90,6 +89,8 @@ JEMALLOC_GENERATE_ATOMICS(bool, b, 0)
JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(unsigned, u, LG_SIZEOF_INT)
JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(int, i, LG_SIZEOF_INT)
JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(size_t, zu, LG_SIZEOF_PTR)
JEMALLOC_GENERATE_EXPANDED_INT_ATOMICS(ssize_t, zd, LG_SIZEOF_PTR)


@@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_ATOMIC_C11_H
#define JEMALLOC_INTERNAL_ATOMIC_C11_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include <stdatomic.h>
#define ATOMIC_INIT(...) ATOMIC_VAR_INIT(__VA_ARGS__)
@@ -14,6 +15,7 @@
#define atomic_fence atomic_thread_fence
/* clang-format off */
#define JEMALLOC_GENERATE_ATOMICS(type, short_type, \
/* unused */ lg_size) \
typedef _Atomic(type) atomic_##short_type##_t; \
@@ -58,40 +60,35 @@ atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \
return atomic_compare_exchange_strong_explicit(a, expected, \
desired, success_mo, failure_mo); \
}
/* clang-format on */
/*
* Integral types have some special operations available that non-integral ones
* lack.
*/
#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, \
/* unused */ lg_size) \
JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size) \
\
ATOMIC_INLINE type \
atomic_fetch_add_##short_type(atomic_##short_type##_t *a, \
type val, atomic_memory_order_t mo) { \
return atomic_fetch_add_explicit(a, val, mo); \
} \
\
ATOMIC_INLINE type \
atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, \
type val, atomic_memory_order_t mo) { \
return atomic_fetch_sub_explicit(a, val, mo); \
} \
ATOMIC_INLINE type \
atomic_fetch_and_##short_type(atomic_##short_type##_t *a, \
type val, atomic_memory_order_t mo) { \
return atomic_fetch_and_explicit(a, val, mo); \
} \
ATOMIC_INLINE type \
atomic_fetch_or_##short_type(atomic_##short_type##_t *a, \
type val, atomic_memory_order_t mo) { \
return atomic_fetch_or_explicit(a, val, mo); \
} \
ATOMIC_INLINE type \
atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, \
type val, atomic_memory_order_t mo) { \
return atomic_fetch_xor_explicit(a, val, mo); \
}
#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, /* unused */ lg_size) \
JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size) \
\
ATOMIC_INLINE type atomic_fetch_add_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return atomic_fetch_add_explicit(a, val, mo); \
} \
\
ATOMIC_INLINE type atomic_fetch_sub_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return atomic_fetch_sub_explicit(a, val, mo); \
} \
ATOMIC_INLINE type atomic_fetch_and_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return atomic_fetch_and_explicit(a, val, mo); \
} \
ATOMIC_INLINE type atomic_fetch_or_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return atomic_fetch_or_explicit(a, val, mo); \
} \
ATOMIC_INLINE type atomic_fetch_xor_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return atomic_fetch_xor_explicit(a, val, mo); \
}
#endif /* JEMALLOC_INTERNAL_ATOMIC_C11_H */


@@ -1,9 +1,13 @@
#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H
#define JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
#define ATOMIC_INIT(...) {__VA_ARGS__}
#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
#define ATOMIC_INIT(...) \
{ __VA_ARGS__ }
typedef enum {
atomic_memory_order_relaxed,
@@ -36,94 +40,82 @@ atomic_fence(atomic_memory_order_t mo) {
__atomic_thread_fence(atomic_enum_to_builtin(mo));
}
#define JEMALLOC_GENERATE_ATOMICS(type, short_type, \
/* unused */ lg_size) \
typedef struct { \
type repr; \
} atomic_##short_type##_t; \
\
ATOMIC_INLINE type \
atomic_load_##short_type(const atomic_##short_type##_t *a, \
atomic_memory_order_t mo) { \
type result; \
__atomic_load(&a->repr, &result, atomic_enum_to_builtin(mo)); \
return result; \
} \
\
ATOMIC_INLINE void \
atomic_store_##short_type(atomic_##short_type##_t *a, type val, \
atomic_memory_order_t mo) { \
__atomic_store(&a->repr, &val, atomic_enum_to_builtin(mo)); \
} \
\
ATOMIC_INLINE type \
atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \
atomic_memory_order_t mo) { \
type result; \
__atomic_exchange(&a->repr, &val, &result, \
atomic_enum_to_builtin(mo)); \
return result; \
} \
\
ATOMIC_INLINE bool \
atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \
UNUSED type *expected, type desired, \
atomic_memory_order_t success_mo, \
atomic_memory_order_t failure_mo) { \
return __atomic_compare_exchange(&a->repr, expected, &desired, \
true, atomic_enum_to_builtin(success_mo), \
atomic_enum_to_builtin(failure_mo)); \
} \
\
ATOMIC_INLINE bool \
atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \
UNUSED type *expected, type desired, \
atomic_memory_order_t success_mo, \
atomic_memory_order_t failure_mo) { \
return __atomic_compare_exchange(&a->repr, expected, &desired, \
false, \
atomic_enum_to_builtin(success_mo), \
atomic_enum_to_builtin(failure_mo)); \
}
#define JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size) \
typedef struct { \
type repr; \
} atomic_##short_type##_t; \
\
ATOMIC_INLINE type atomic_load_##short_type( \
const atomic_##short_type##_t *a, atomic_memory_order_t mo) { \
type result; \
__atomic_load(&a->repr, &result, atomic_enum_to_builtin(mo)); \
return result; \
} \
\
ATOMIC_INLINE void atomic_store_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
__atomic_store(&a->repr, &val, atomic_enum_to_builtin(mo)); \
} \
\
ATOMIC_INLINE type atomic_exchange_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
type result; \
__atomic_exchange( \
&a->repr, &val, &result, atomic_enum_to_builtin(mo)); \
return result; \
} \
\
ATOMIC_INLINE bool atomic_compare_exchange_weak_##short_type( \
atomic_##short_type##_t *a, UNUSED type *expected, type desired, \
atomic_memory_order_t success_mo, \
atomic_memory_order_t failure_mo) { \
return __atomic_compare_exchange(&a->repr, expected, &desired, \
true, atomic_enum_to_builtin(success_mo), \
atomic_enum_to_builtin(failure_mo)); \
} \
\
ATOMIC_INLINE bool atomic_compare_exchange_strong_##short_type( \
atomic_##short_type##_t *a, UNUSED type *expected, type desired, \
atomic_memory_order_t success_mo, \
atomic_memory_order_t failure_mo) { \
return __atomic_compare_exchange(&a->repr, expected, &desired, \
false, atomic_enum_to_builtin(success_mo), \
atomic_enum_to_builtin(failure_mo)); \
}
#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, /* unused */ lg_size) \
JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size) \
\
ATOMIC_INLINE type atomic_fetch_add_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return __atomic_fetch_add( \
&a->repr, val, atomic_enum_to_builtin(mo)); \
} \
\
ATOMIC_INLINE type atomic_fetch_sub_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return __atomic_fetch_sub( \
&a->repr, val, atomic_enum_to_builtin(mo)); \
} \
\
ATOMIC_INLINE type atomic_fetch_and_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return __atomic_fetch_and( \
&a->repr, val, atomic_enum_to_builtin(mo)); \
} \
\
ATOMIC_INLINE type atomic_fetch_or_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return __atomic_fetch_or( \
&a->repr, val, atomic_enum_to_builtin(mo)); \
} \
\
ATOMIC_INLINE type atomic_fetch_xor_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return __atomic_fetch_xor( \
&a->repr, val, atomic_enum_to_builtin(mo)); \
}
#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, \
/* unused */ lg_size) \
JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size) \
\
ATOMIC_INLINE type \
atomic_fetch_add_##short_type(atomic_##short_type##_t *a, type val, \
atomic_memory_order_t mo) { \
return __atomic_fetch_add(&a->repr, val, \
atomic_enum_to_builtin(mo)); \
} \
\
ATOMIC_INLINE type \
atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, type val, \
atomic_memory_order_t mo) { \
return __atomic_fetch_sub(&a->repr, val, \
atomic_enum_to_builtin(mo)); \
} \
\
ATOMIC_INLINE type \
atomic_fetch_and_##short_type(atomic_##short_type##_t *a, type val, \
atomic_memory_order_t mo) { \
return __atomic_fetch_and(&a->repr, val, \
atomic_enum_to_builtin(mo)); \
} \
\
ATOMIC_INLINE type \
atomic_fetch_or_##short_type(atomic_##short_type##_t *a, type val, \
atomic_memory_order_t mo) { \
return __atomic_fetch_or(&a->repr, val, \
atomic_enum_to_builtin(mo)); \
} \
\
ATOMIC_INLINE type \
atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val, \
atomic_memory_order_t mo) { \
return __atomic_fetch_xor(&a->repr, val, \
atomic_enum_to_builtin(mo)); \
}
#undef ATOMIC_INLINE
#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_ATOMIC_H */


@@ -1,7 +1,12 @@
#ifndef JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H
#define JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H
#define ATOMIC_INIT(...) {__VA_ARGS__}
#include "jemalloc/internal/jemalloc_preamble.h"
#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
#define ATOMIC_INIT(...) \
{ __VA_ARGS__ }
typedef enum {
atomic_memory_order_relaxed,
@@ -25,13 +30,13 @@ atomic_fence(atomic_memory_order_t mo) {
return;
}
asm volatile("" ::: "memory");
# if defined(__i386__) || defined(__x86_64__)
#if defined(__i386__) || defined(__x86_64__)
/* This is implicit on x86. */
# elif defined(__ppc64__)
#elif defined(__ppc64__)
asm volatile("lwsync");
# elif defined(__ppc__)
#elif defined(__ppc__)
asm volatile("sync");
# elif defined(__sparc__) && defined(__arch64__)
#elif defined(__sparc__) && defined(__arch64__)
if (mo == atomic_memory_order_acquire) {
asm volatile("membar #LoadLoad | #LoadStore");
} else if (mo == atomic_memory_order_release) {
@@ -39,9 +44,9 @@ atomic_fence(atomic_memory_order_t mo) {
} else {
asm volatile("membar #LoadLoad | #LoadStore | #StoreStore");
}
# else
#else
__sync_synchronize();
# endif
#endif
asm volatile("" ::: "memory");
}
@@ -64,25 +69,25 @@ atomic_fence(atomic_memory_order_t mo) {
ATOMIC_INLINE void
atomic_pre_sc_load_fence() {
# if defined(__i386__) || defined(__x86_64__) || \
(defined(__sparc__) && defined(__arch64__))
#if defined(__i386__) || defined(__x86_64__) \
|| (defined(__sparc__) && defined(__arch64__))
atomic_fence(atomic_memory_order_relaxed);
# else
#else
atomic_fence(atomic_memory_order_seq_cst);
# endif
#endif
}
ATOMIC_INLINE void
atomic_post_sc_store_fence() {
# if defined(__i386__) || defined(__x86_64__) || \
(defined(__sparc__) && defined(__arch64__))
#if defined(__i386__) || defined(__x86_64__) \
|| (defined(__sparc__) && defined(__arch64__))
atomic_fence(atomic_memory_order_seq_cst);
# else
#else
atomic_fence(atomic_memory_order_relaxed);
# endif
#endif
}
/* clang-format off */
#define JEMALLOC_GENERATE_ATOMICS(type, short_type, \
/* unused */ lg_size) \
typedef struct { \
@@ -157,39 +162,36 @@ atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \
return false; \
} \
}
/* clang-format on */
#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, \
/* unused */ lg_size) \
JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size) \
\
ATOMIC_INLINE type \
atomic_fetch_add_##short_type(atomic_##short_type##_t *a, type val, \
atomic_memory_order_t mo) { \
return __sync_fetch_and_add(&a->repr, val); \
} \
\
ATOMIC_INLINE type \
atomic_fetch_sub_##short_type(atomic_##short_type##_t *a, type val, \
atomic_memory_order_t mo) { \
return __sync_fetch_and_sub(&a->repr, val); \
} \
\
ATOMIC_INLINE type \
atomic_fetch_and_##short_type(atomic_##short_type##_t *a, type val, \
atomic_memory_order_t mo) { \
return __sync_fetch_and_and(&a->repr, val); \
} \
\
ATOMIC_INLINE type \
atomic_fetch_or_##short_type(atomic_##short_type##_t *a, type val, \
atomic_memory_order_t mo) { \
return __sync_fetch_and_or(&a->repr, val); \
} \
\
ATOMIC_INLINE type \
atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, type val, \
atomic_memory_order_t mo) { \
return __sync_fetch_and_xor(&a->repr, val); \
}
#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, /* unused */ lg_size) \
JEMALLOC_GENERATE_ATOMICS(type, short_type, /* unused */ lg_size) \
\
ATOMIC_INLINE type atomic_fetch_add_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return __sync_fetch_and_add(&a->repr, val); \
} \
\
ATOMIC_INLINE type atomic_fetch_sub_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return __sync_fetch_and_sub(&a->repr, val); \
} \
\
ATOMIC_INLINE type atomic_fetch_and_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return __sync_fetch_and_and(&a->repr, val); \
} \
\
ATOMIC_INLINE type atomic_fetch_or_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return __sync_fetch_and_or(&a->repr, val); \
} \
\
ATOMIC_INLINE type atomic_fetch_xor_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return __sync_fetch_and_xor(&a->repr, val); \
}
#undef ATOMIC_INLINE
#endif /* JEMALLOC_INTERNAL_ATOMIC_GCC_SYNC_H */


@@ -1,7 +1,12 @@
#ifndef JEMALLOC_INTERNAL_ATOMIC_MSVC_H
#define JEMALLOC_INTERNAL_ATOMIC_MSVC_H
#define ATOMIC_INIT(...) {__VA_ARGS__}
#include "jemalloc/internal/jemalloc_preamble.h"
#define ATOMIC_INLINE JEMALLOC_ALWAYS_INLINE
#define ATOMIC_INIT(...) \
{ __VA_ARGS__ }
typedef enum {
atomic_memory_order_relaxed,
@@ -11,109 +16,106 @@ typedef enum {
atomic_memory_order_seq_cst
} atomic_memory_order_t;
typedef char atomic_repr_0_t;
typedef short atomic_repr_1_t;
typedef long atomic_repr_2_t;
typedef char atomic_repr_0_t;
typedef short atomic_repr_1_t;
typedef long atomic_repr_2_t;
typedef __int64 atomic_repr_3_t;
ATOMIC_INLINE void
atomic_fence(atomic_memory_order_t mo) {
_ReadWriteBarrier();
# if defined(_M_ARM) || defined(_M_ARM64)
#if defined(_M_ARM) || defined(_M_ARM64)
/* ARM needs a barrier for everything but relaxed. */
if (mo != atomic_memory_order_relaxed) {
MemoryBarrier();
}
# elif defined(_M_IX86) || defined (_M_X64)
#elif defined(_M_IX86) || defined(_M_X64)
/* x86 needs a barrier only for seq_cst. */
if (mo == atomic_memory_order_seq_cst) {
MemoryBarrier();
}
# else
# error "Don't know how to create atomics for this platform for MSVC."
# endif
#else
# error "Don't know how to create atomics for this platform for MSVC."
#endif
_ReadWriteBarrier();
}
#define ATOMIC_INTERLOCKED_REPR(lg_size) atomic_repr_ ## lg_size ## _t
#define ATOMIC_INTERLOCKED_REPR(lg_size) atomic_repr_##lg_size##_t
#define ATOMIC_CONCAT(a, b) ATOMIC_RAW_CONCAT(a, b)
#define ATOMIC_RAW_CONCAT(a, b) a ## b
#define ATOMIC_RAW_CONCAT(a, b) a##b
#define ATOMIC_INTERLOCKED_NAME(base_name, lg_size) ATOMIC_CONCAT( \
base_name, ATOMIC_INTERLOCKED_SUFFIX(lg_size))
#define ATOMIC_INTERLOCKED_NAME(base_name, lg_size) \
ATOMIC_CONCAT(base_name, ATOMIC_INTERLOCKED_SUFFIX(lg_size))
#define ATOMIC_INTERLOCKED_SUFFIX(lg_size) \
ATOMIC_CONCAT(ATOMIC_INTERLOCKED_SUFFIX_, lg_size)
#define ATOMIC_INTERLOCKED_SUFFIX(lg_size) \
ATOMIC_CONCAT(ATOMIC_INTERLOCKED_SUFFIX_, lg_size)
#define ATOMIC_INTERLOCKED_SUFFIX_0 8
#define ATOMIC_INTERLOCKED_SUFFIX_1 16
#define ATOMIC_INTERLOCKED_SUFFIX_2
#define ATOMIC_INTERLOCKED_SUFFIX_3 64
#define JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size) \
typedef struct { \
ATOMIC_INTERLOCKED_REPR(lg_size) repr; \
} atomic_##short_type##_t; \
\
ATOMIC_INLINE type \
atomic_load_##short_type(const atomic_##short_type##_t *a, \
atomic_memory_order_t mo) { \
ATOMIC_INTERLOCKED_REPR(lg_size) ret = a->repr; \
if (mo != atomic_memory_order_relaxed) { \
atomic_fence(atomic_memory_order_acquire); \
} \
return (type) ret; \
} \
\
ATOMIC_INLINE void \
atomic_store_##short_type(atomic_##short_type##_t *a, \
type val, atomic_memory_order_t mo) { \
if (mo != atomic_memory_order_relaxed) { \
atomic_fence(atomic_memory_order_release); \
} \
a->repr = (ATOMIC_INTERLOCKED_REPR(lg_size)) val; \
if (mo == atomic_memory_order_seq_cst) { \
atomic_fence(atomic_memory_order_seq_cst); \
} \
} \
\
ATOMIC_INLINE type \
atomic_exchange_##short_type(atomic_##short_type##_t *a, type val, \
atomic_memory_order_t mo) { \
return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedExchange, \
lg_size)(&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \
} \
\
ATOMIC_INLINE bool \
atomic_compare_exchange_weak_##short_type(atomic_##short_type##_t *a, \
type *expected, type desired, atomic_memory_order_t success_mo, \
atomic_memory_order_t failure_mo) { \
ATOMIC_INTERLOCKED_REPR(lg_size) e = \
(ATOMIC_INTERLOCKED_REPR(lg_size))*expected; \
ATOMIC_INTERLOCKED_REPR(lg_size) d = \
(ATOMIC_INTERLOCKED_REPR(lg_size))desired; \
ATOMIC_INTERLOCKED_REPR(lg_size) old = \
ATOMIC_INTERLOCKED_NAME(_InterlockedCompareExchange, \
lg_size)(&a->repr, d, e); \
if (old == e) { \
return true; \
} else { \
*expected = (type)old; \
return false; \
} \
} \
\
ATOMIC_INLINE bool \
atomic_compare_exchange_strong_##short_type(atomic_##short_type##_t *a, \
type *expected, type desired, atomic_memory_order_t success_mo, \
atomic_memory_order_t failure_mo) { \
/* We implement the weak version with strong semantics. */ \
return atomic_compare_exchange_weak_##short_type(a, expected, \
desired, success_mo, failure_mo); \
}
#define JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size) \
typedef struct { \
ATOMIC_INTERLOCKED_REPR(lg_size) repr; \
} atomic_##short_type##_t; \
\
ATOMIC_INLINE type atomic_load_##short_type( \
const atomic_##short_type##_t *a, atomic_memory_order_t mo) { \
ATOMIC_INTERLOCKED_REPR(lg_size) ret = a->repr; \
if (mo != atomic_memory_order_relaxed) { \
atomic_fence(atomic_memory_order_acquire); \
} \
return (type)ret; \
} \
\
ATOMIC_INLINE void atomic_store_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
if (mo != atomic_memory_order_relaxed) { \
atomic_fence(atomic_memory_order_release); \
} \
a->repr = (ATOMIC_INTERLOCKED_REPR(lg_size))val; \
if (mo == atomic_memory_order_seq_cst) { \
atomic_fence(atomic_memory_order_seq_cst); \
} \
} \
\
ATOMIC_INLINE type atomic_exchange_##short_type( \
atomic_##short_type##_t *a, type val, atomic_memory_order_t mo) { \
return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedExchange, \
lg_size)(&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \
} \
\
ATOMIC_INLINE bool atomic_compare_exchange_weak_##short_type( \
atomic_##short_type##_t *a, type *expected, type desired, \
atomic_memory_order_t success_mo, \
atomic_memory_order_t failure_mo) { \
ATOMIC_INTERLOCKED_REPR(lg_size) \
e = (ATOMIC_INTERLOCKED_REPR(lg_size)) * expected; \
ATOMIC_INTERLOCKED_REPR(lg_size) \
d = (ATOMIC_INTERLOCKED_REPR(lg_size))desired; \
ATOMIC_INTERLOCKED_REPR(lg_size) \
old = ATOMIC_INTERLOCKED_NAME( \
_InterlockedCompareExchange, lg_size)(&a->repr, d, e); \
if (old == e) { \
return true; \
} else { \
*expected = (type)old; \
return false; \
} \
} \
\
ATOMIC_INLINE bool atomic_compare_exchange_strong_##short_type( \
atomic_##short_type##_t *a, type *expected, type desired, \
atomic_memory_order_t success_mo, \
atomic_memory_order_t failure_mo) { \
/* We implement the weak version with strong semantics. */ \
return atomic_compare_exchange_weak_##short_type( \
a, expected, desired, success_mo, failure_mo); \
}
/* clang-format off */
#define JEMALLOC_GENERATE_INT_ATOMICS(type, short_type, lg_size) \
JEMALLOC_GENERATE_ATOMICS(type, short_type, lg_size) \
\
@@ -154,5 +156,8 @@ atomic_fetch_xor_##short_type(atomic_##short_type##_t *a, \
return (type)ATOMIC_INTERLOCKED_NAME(_InterlockedXor, lg_size)( \
&a->repr, (ATOMIC_INTERLOCKED_REPR(lg_size))val); \
}
/* clang-format on */
#undef ATOMIC_INLINE
#endif /* JEMALLOC_INTERNAL_ATOMIC_MSVC_H */


@@ -1,26 +1,31 @@
#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H
#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_EXTERNS_H
extern bool opt_background_thread;
extern size_t opt_max_background_threads;
extern malloc_mutex_t background_thread_lock;
extern atomic_b_t background_thread_enabled_state;
extern size_t n_background_threads;
extern size_t max_background_threads;
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/background_thread_structs.h"
#include "jemalloc/internal/base.h"
#include "jemalloc/internal/mutex.h"
extern bool opt_background_thread;
extern size_t opt_max_background_threads;
extern malloc_mutex_t background_thread_lock;
extern atomic_b_t background_thread_enabled_state;
extern size_t n_background_threads;
extern size_t max_background_threads;
extern background_thread_info_t *background_thread_info;
bool background_thread_create(tsd_t *tsd, unsigned arena_ind);
bool background_threads_enable(tsd_t *tsd);
bool background_threads_disable(tsd_t *tsd);
bool background_thread_is_started(background_thread_info_t* info);
void background_thread_wakeup_early(background_thread_info_t *info,
nstime_t *remaining_sleep);
bool background_thread_is_started(background_thread_info_t *info);
void background_thread_wakeup_early(
background_thread_info_t *info, nstime_t *remaining_sleep);
void background_thread_prefork0(tsdn_t *tsdn);
void background_thread_prefork1(tsdn_t *tsdn);
void background_thread_postfork_parent(tsdn_t *tsdn);
void background_thread_postfork_child(tsdn_t *tsdn);
bool background_thread_stats_read(tsdn_t *tsdn,
background_thread_stats_t *stats);
bool background_thread_stats_read(
tsdn_t *tsdn, background_thread_stats_t *stats);
void background_thread_ctl_init(tsdn_t *tsdn);
#ifdef JEMALLOC_PTHREAD_CREATE_WRAPPER


@@ -1,15 +1,25 @@
#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H
#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_INLINES_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_inlines_a.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/background_thread_externs.h"
JEMALLOC_ALWAYS_INLINE bool
background_thread_enabled(void) {
return atomic_load_b(&background_thread_enabled_state, ATOMIC_RELAXED);
}
JEMALLOC_ALWAYS_INLINE void
background_thread_enabled_set_impl(bool state) {
atomic_store_b(&background_thread_enabled_state, state, ATOMIC_RELAXED);
}
JEMALLOC_ALWAYS_INLINE void
background_thread_enabled_set(tsdn_t *tsdn, bool state) {
malloc_mutex_assert_owner(tsdn, &background_thread_lock);
atomic_store_b(&background_thread_enabled_state, state, ATOMIC_RELAXED);
background_thread_enabled_set_impl(state);
}
JEMALLOC_ALWAYS_INLINE background_thread_info_t *
@@ -26,14 +36,14 @@ background_thread_info_get(size_t ind) {
JEMALLOC_ALWAYS_INLINE uint64_t
background_thread_wakeup_time_get(background_thread_info_t *info) {
uint64_t next_wakeup = nstime_ns(&info->next_wakeup);
assert(atomic_load_b(&info->indefinite_sleep, ATOMIC_ACQUIRE) ==
(next_wakeup == BACKGROUND_THREAD_INDEFINITE_SLEEP));
assert(atomic_load_b(&info->indefinite_sleep, ATOMIC_ACQUIRE)
== (next_wakeup == BACKGROUND_THREAD_INDEFINITE_SLEEP));
return next_wakeup;
}
JEMALLOC_ALWAYS_INLINE void
background_thread_wakeup_time_set(tsdn_t *tsdn, background_thread_info_t *info,
uint64_t wakeup_time) {
background_thread_wakeup_time_set(
tsdn_t *tsdn, background_thread_info_t *info, uint64_t wakeup_time) {
malloc_mutex_assert_owner(tsdn, &info->mtx);
atomic_store_b(&info->indefinite_sleep,
wakeup_time == BACKGROUND_THREAD_INDEFINITE_SLEEP, ATOMIC_RELEASE);


@@ -1,10 +1,13 @@
#ifndef JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H
#define JEMALLOC_INTERNAL_BACKGROUND_THREAD_STRUCTS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/mutex.h"
/* This file really combines "structs" and "types", but only transitionally. */
#if defined(JEMALLOC_BACKGROUND_THREAD) || defined(JEMALLOC_LAZY_LOCK)
# define JEMALLOC_PTHREAD_CREATE_WRAPPER
# define JEMALLOC_PTHREAD_CREATE_WRAPPER
#endif
#define BACKGROUND_THREAD_INDEFINITE_SLEEP UINT64_MAX
@@ -32,33 +35,33 @@ typedef enum {
struct background_thread_info_s {
#ifdef JEMALLOC_BACKGROUND_THREAD
/* Background thread is pthread specific. */
pthread_t thread;
pthread_cond_t cond;
pthread_t thread;
pthread_cond_t cond;
#endif
malloc_mutex_t mtx;
background_thread_state_t state;
malloc_mutex_t mtx;
background_thread_state_t state;
/* When true, it means no wakeup scheduled. */
atomic_b_t indefinite_sleep;
atomic_b_t indefinite_sleep;
/* Next scheduled wakeup time (absolute time in ns). */
nstime_t next_wakeup;
nstime_t next_wakeup;
/*
* Since the last background thread run, newly added number of pages
* that need to be purged by the next wakeup. This is adjusted on
* epoch advance, and is used to determine whether we should signal the
* background thread to wake up earlier.
*/
size_t npages_to_purge_new;
size_t npages_to_purge_new;
/* Stats: total number of runs since started. */
uint64_t tot_n_runs;
uint64_t tot_n_runs;
/* Stats: total sleep time since started. */
nstime_t tot_sleep_time;
nstime_t tot_sleep_time;
};
typedef struct background_thread_info_s background_thread_info_t;
struct background_thread_stats_s {
size_t num_threads;
uint64_t num_runs;
nstime_t run_interval;
size_t num_threads;
uint64_t num_runs;
nstime_t run_interval;
mutex_prof_data_t max_counter_per_bg_thd;
};
typedef struct background_thread_stats_s background_thread_stats_t;


@@ -1,12 +1,19 @@
#ifndef JEMALLOC_INTERNAL_BASE_H
#define JEMALLOC_INTERNAL_BASE_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/edata.h"
#include "jemalloc/internal/ehooks.h"
#include "jemalloc/internal/mutex.h"
/*
* Alignment when THP is not enabled. Set to constant 2M in case the HUGEPAGE
* value is unexpected high (which would cause VM over-reservation).
*/
#define BASE_BLOCK_MIN_ALIGN ((size_t)2 << 20)
enum metadata_thp_mode_e {
metadata_thp_disabled = 0,
metadata_thp_disabled = 0,
/*
* Lazily enable hugepage for metadata. To avoid high RSS caused by THP
* + low usage arena (i.e. THP becomes a significant percentage), the
@@ -15,16 +22,15 @@ enum metadata_thp_mode_e {
* arena), "auto" behaves the same as "always", i.e. madvise hugepage
* right away.
*/
metadata_thp_auto = 1,
metadata_thp_always = 2,
metadata_thp_auto = 1,
metadata_thp_always = 2,
metadata_thp_mode_limit = 3
};
typedef enum metadata_thp_mode_e metadata_thp_mode_t;
#define METADATA_THP_DEFAULT metadata_thp_disabled
extern metadata_thp_mode_t opt_metadata_thp;
extern const char *metadata_thp_mode_names[];
extern const char *const metadata_thp_mode_names[];
/* Embedded at the beginning of every block of base-managed virtual memory. */
typedef struct base_block_s base_block_t;
@@ -72,8 +78,13 @@ struct base_s {
/* Heap of extents that track unused trailing space within blocks. */
edata_heap_t avail[SC_NSIZES];
/* Contains reusable base edata (used by tcache_stacks currently). */
edata_avail_t edata_avail;
/* Stats, only maintained if config_stats. */
size_t allocated;
size_t edata_allocated;
size_t rtree_allocated;
size_t resident;
size_t mapped;
/* Number of THP regions touched. */
@@ -91,20 +102,24 @@ metadata_thp_enabled(void) {
}
base_t *b0get(void);
base_t *base_new(tsdn_t *tsdn, unsigned ind,
const extent_hooks_t *extent_hooks, bool metadata_use_hooks);
void base_delete(tsdn_t *tsdn, base_t *base);
ehooks_t *base_ehooks_get(base_t *base);
ehooks_t *base_ehooks_get_for_metadata(base_t *base);
extent_hooks_t *base_extent_hooks_set(base_t *base,
extent_hooks_t *extent_hooks);
void *base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment);
base_t *base_new(tsdn_t *tsdn, unsigned ind, const extent_hooks_t *extent_hooks,
bool metadata_use_hooks);
void base_delete(tsdn_t *tsdn, base_t *base);
ehooks_t *base_ehooks_get(base_t *base);
ehooks_t *base_ehooks_get_for_metadata(base_t *base);
extent_hooks_t *base_extent_hooks_set(
base_t *base, extent_hooks_t *extent_hooks);
void *base_alloc(tsdn_t *tsdn, base_t *base, size_t size, size_t alignment);
edata_t *base_alloc_edata(tsdn_t *tsdn, base_t *base);
void base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated,
size_t *resident, size_t *mapped, size_t *n_thp);
void base_prefork(tsdn_t *tsdn, base_t *base);
void base_postfork_parent(tsdn_t *tsdn, base_t *base);
void base_postfork_child(tsdn_t *tsdn, base_t *base);
bool base_boot(tsdn_t *tsdn);
void *base_alloc_rtree(tsdn_t *tsdn, base_t *base, size_t size);
void *b0_alloc_tcache_stack(tsdn_t *tsdn, size_t size);
void b0_dalloc_tcache_stack(tsdn_t *tsdn, void *tcache_stack);
void base_stats_get(tsdn_t *tsdn, base_t *base, size_t *allocated,
size_t *edata_allocated, size_t *rtree_allocated, size_t *resident,
size_t *mapped, size_t *n_thp);
void base_prefork(tsdn_t *tsdn, base_t *base);
void base_postfork_parent(tsdn_t *tsdn, base_t *base);
void base_postfork_child(tsdn_t *tsdn, base_t *base);
bool base_boot(tsdn_t *tsdn);
#endif /* JEMALLOC_INTERNAL_BASE_H */


@@ -1,6 +1,8 @@
#ifndef JEMALLOC_INTERNAL_BIN_H
#define JEMALLOC_INTERNAL_BIN_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/bin_info.h"
#include "jemalloc/internal/bin_stats.h"
#include "jemalloc/internal/bin_types.h"
#include "jemalloc/internal/edata.h"
@@ -14,13 +16,13 @@
typedef struct bin_s bin_t;
struct bin_s {
/* All operations on bin_t fields require lock ownership. */
malloc_mutex_t lock;
malloc_mutex_t lock;
/*
* Bin statistics. These get touched every time the lock is acquired,
* so put them close by in the hopes of getting some cache locality.
*/
bin_stats_t stats;
bin_stats_t stats;
/*
* Current slab being used to service allocations of this bin's size
@@ -28,17 +30,17 @@ struct bin_s {
* slabcur is reassigned, the previous slab must be deallocated or
* inserted into slabs_{nonfull,full}.
*/
edata_t *slabcur;
edata_t *slabcur;
/*
* Heap of non-full slabs. This heap is used to assure that new
* allocations come from the non-full slab that is oldest/lowest in
* memory.
*/
edata_heap_t slabs_nonfull;
edata_heap_t slabs_nonfull;
/* List used to track full slabs. */
edata_list_active_t slabs_full;
edata_list_active_t slabs_full;
};
/* A set of sharded bins of the same size class. */
@@ -48,7 +50,7 @@ struct bins_s {
bin_t *bin_shards;
};
void bin_shard_sizes_boot(unsigned bin_shards[SC_NBINS]);
void bin_shard_sizes_boot(unsigned bin_shard_sizes[SC_NBINS]);
bool bin_update_shard_size(unsigned bin_shards[SC_NBINS], size_t start_size,
size_t end_size, size_t nshards);
@@ -60,6 +62,43 @@ void bin_prefork(tsdn_t *tsdn, bin_t *bin);
void bin_postfork_parent(tsdn_t *tsdn, bin_t *bin);
void bin_postfork_child(tsdn_t *tsdn, bin_t *bin);
/* Slab region allocation. */
void *bin_slab_reg_alloc(edata_t *slab, const bin_info_t *bin_info);
void bin_slab_reg_alloc_batch(
edata_t *slab, const bin_info_t *bin_info, unsigned cnt, void **ptrs);
/* Slab list management. */
void bin_slabs_nonfull_insert(bin_t *bin, edata_t *slab);
void bin_slabs_nonfull_remove(bin_t *bin, edata_t *slab);
edata_t *bin_slabs_nonfull_tryget(bin_t *bin);
void bin_slabs_full_insert(bool is_auto, bin_t *bin, edata_t *slab);
void bin_slabs_full_remove(bool is_auto, bin_t *bin, edata_t *slab);
/* Slab association / demotion. */
void bin_dissociate_slab(bool is_auto, edata_t *slab, bin_t *bin);
void bin_lower_slab(tsdn_t *tsdn, bool is_auto, edata_t *slab, bin_t *bin);
/* Deallocation helpers (called under bin lock). */
void bin_dalloc_slab_prepare(tsdn_t *tsdn, edata_t *slab, bin_t *bin);
void bin_dalloc_locked_handle_newly_empty(
tsdn_t *tsdn, bool is_auto, edata_t *slab, bin_t *bin);
void bin_dalloc_locked_handle_newly_nonempty(
tsdn_t *tsdn, bool is_auto, edata_t *slab, bin_t *bin);
/* Slabcur refill and allocation. */
void bin_refill_slabcur_with_fresh_slab(tsdn_t *tsdn, bin_t *bin,
szind_t binind, edata_t *fresh_slab);
void *bin_malloc_with_fresh_slab(tsdn_t *tsdn, bin_t *bin,
szind_t binind, edata_t *fresh_slab);
bool bin_refill_slabcur_no_fresh_slab(tsdn_t *tsdn, bool is_auto,
bin_t *bin);
void *bin_malloc_no_fresh_slab(tsdn_t *tsdn, bool is_auto, bin_t *bin,
szind_t binind);
/* Bin selection. */
bin_t *bin_choose(tsdn_t *tsdn, arena_t *arena, szind_t binind,
unsigned *binshard_p);
/* Stats. */
static inline void
bin_stats_merge(tsdn_t *tsdn, bin_stats_data_t *dst_bin_stats, bin_t *bin) {


@@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_BIN_INFO_H
#define JEMALLOC_INTERNAL_BIN_INFO_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/bitmap.h"
/*
@@ -25,22 +26,22 @@
typedef struct bin_info_s bin_info_t;
struct bin_info_s {
/* Size of regions in a slab for this bin's size class. */
size_t reg_size;
size_t reg_size;
/* Total size of a slab for this bin's size class. */
size_t slab_size;
size_t slab_size;
/* Total number of regions in a slab for this bin's size class. */
uint32_t nregs;
uint32_t nregs;
/* Number of sharded bins in each arena for this size class. */
uint32_t n_shards;
uint32_t n_shards;
/*
* Metadata used to manipulate bitmaps for slabs associated with this
* bin.
*/
bitmap_info_t bitmap_info;
bitmap_info_t bitmap_info;
};
extern bin_info_t bin_infos[SC_NBINS];


@@ -0,0 +1,112 @@
#ifndef JEMALLOC_INTERNAL_BIN_INLINES_H
#define JEMALLOC_INTERNAL_BIN_INLINES_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/bin.h"
#include "jemalloc/internal/bin_info.h"
#include "jemalloc/internal/bitmap.h"
#include "jemalloc/internal/div.h"
#include "jemalloc/internal/edata.h"
#include "jemalloc/internal/sc.h"
/*
* The dalloc bin info contains just the information that the common paths need
* during tcache flushes. By force-inlining these paths, and using local copies
* of data (so that the compiler knows it's constant), we avoid a whole bunch of
* redundant loads and stores by leaving this information in registers.
*/
typedef struct bin_dalloc_locked_info_s bin_dalloc_locked_info_t;
struct bin_dalloc_locked_info_s {
div_info_t div_info;
uint32_t nregs;
uint64_t ndalloc;
};
/* Find the region index of a pointer within a slab. */
JEMALLOC_ALWAYS_INLINE size_t
bin_slab_regind_impl(
div_info_t *div_info, szind_t binind, edata_t *slab, const void *ptr) {
size_t diff, regind;
/* Freeing a pointer outside the slab can cause assertion failure. */
assert((uintptr_t)ptr >= (uintptr_t)edata_addr_get(slab));
assert((uintptr_t)ptr < (uintptr_t)edata_past_get(slab));
/* Freeing an interior pointer can cause assertion failure. */
assert(((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab))
% (uintptr_t)bin_infos[binind].reg_size
== 0);
diff = (size_t)((uintptr_t)ptr - (uintptr_t)edata_addr_get(slab));
/* Avoid doing division with a variable divisor. */
regind = div_compute(div_info, diff);
assert(regind < bin_infos[binind].nregs);
return regind;
}
JEMALLOC_ALWAYS_INLINE size_t
bin_slab_regind(bin_dalloc_locked_info_t *info, szind_t binind,
edata_t *slab, const void *ptr) {
size_t regind = bin_slab_regind_impl(
&info->div_info, binind, slab, ptr);
return regind;
}
JEMALLOC_ALWAYS_INLINE void
bin_dalloc_locked_begin(
bin_dalloc_locked_info_t *info, szind_t binind) {
info->div_info = arena_binind_div_info[binind];
info->nregs = bin_infos[binind].nregs;
info->ndalloc = 0;
}
/*
* Does the deallocation work associated with freeing a single pointer (a
* "step") in between a bin_dalloc_locked begin and end call.
*
* Returns true if arena_slab_dalloc must be called on slab. Doesn't do
* stats updates, which happen during finish (this lets running counts get left
* in a register).
*/
JEMALLOC_ALWAYS_INLINE bool
bin_dalloc_locked_step(tsdn_t *tsdn, bool is_auto, bin_t *bin,
bin_dalloc_locked_info_t *info, szind_t binind, edata_t *slab,
void *ptr) {
const bin_info_t *bin_info = &bin_infos[binind];
size_t regind = bin_slab_regind(info, binind, slab, ptr);
slab_data_t *slab_data = edata_slab_data_get(slab);
assert(edata_nfree_get(slab) < bin_info->nregs);
/* Freeing an unallocated pointer can cause assertion failure. */
assert(bitmap_get(slab_data->bitmap, &bin_info->bitmap_info, regind));
bitmap_unset(slab_data->bitmap, &bin_info->bitmap_info, regind);
edata_nfree_inc(slab);
if (config_stats) {
info->ndalloc++;
}
unsigned nfree = edata_nfree_get(slab);
if (nfree == bin_info->nregs) {
bin_dalloc_locked_handle_newly_empty(
tsdn, is_auto, slab, bin);
return true;
} else if (nfree == 1 && slab != bin->slabcur) {
bin_dalloc_locked_handle_newly_nonempty(
tsdn, is_auto, slab, bin);
}
return false;
}
JEMALLOC_ALWAYS_INLINE void
bin_dalloc_locked_finish(tsdn_t *tsdn, bin_t *bin,
bin_dalloc_locked_info_t *info) {
if (config_stats) {
bin->stats.ndalloc += info->ndalloc;
assert(bin->stats.curregs >= (size_t)info->ndalloc);
bin->stats.curregs -= (size_t)info->ndalloc;
}
}
#endif /* JEMALLOC_INTERNAL_BIN_INLINES_H */


@@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_BIN_STATS_H
#define JEMALLOC_INTERNAL_BIN_STATS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/mutex_prof.h"
typedef struct bin_stats_s bin_stats_t;
@@ -11,47 +12,47 @@ struct bin_stats_s {
* many times, resulting many increments to nrequests, but only one
* each to nmalloc and ndalloc.
*/
uint64_t nmalloc;
uint64_t ndalloc;
uint64_t nmalloc;
uint64_t ndalloc;
/*
* Number of allocation requests that correspond to the size of this
* bin. This includes requests served by tcache, though tcache only
* periodically merges into this counter.
*/
uint64_t nrequests;
uint64_t nrequests;
/*
* Current number of regions of this size class, including regions
* currently cached by tcache.
*/
size_t curregs;
size_t curregs;
/* Number of tcache fills from this bin. */
uint64_t nfills;
uint64_t nfills;
/* Number of tcache flushes to this bin. */
uint64_t nflushes;
uint64_t nflushes;
/* Total number of slabs created for this bin's size class. */
uint64_t nslabs;
uint64_t nslabs;
/*
* Total number of slabs reused by extracting them from the slabs heap
* for this bin's size class.
*/
uint64_t reslabs;
uint64_t reslabs;
/* Current number of slabs in this bin. */
size_t curslabs;
size_t curslabs;
/* Current size of nonfull slabs heap in this bin. */
size_t nonfull_slabs;
size_t nonfull_slabs;
};
typedef struct bin_stats_data_s bin_stats_data_t;
struct bin_stats_data_s {
bin_stats_t stats_data;
bin_stats_t stats_data;
mutex_prof_data_t mutex_data;
};
#endif /* JEMALLOC_INTERNAL_BIN_STATS_H */


@@ -1,13 +1,17 @@
#ifndef JEMALLOC_INTERNAL_BIN_TYPES_H
#define JEMALLOC_INTERNAL_BIN_TYPES_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/sc.h"
#define BIN_SHARDS_MAX (1 << EDATA_BITS_BINSHARD_WIDTH)
#define N_BIN_SHARDS_DEFAULT 1
/* Used in TSD static initializer only. Real init in arena_bind(). */
#define TSD_BINSHARDS_ZERO_INITIALIZER {{UINT8_MAX}}
#define TSD_BINSHARDS_ZERO_INITIALIZER \
{ \
{ UINT8_MAX } \
}
typedef struct tsd_binshards_s tsd_binshards_t;
struct tsd_binshards_s {


@@ -1,12 +1,13 @@
#ifndef JEMALLOC_INTERNAL_BIT_UTIL_H
#define JEMALLOC_INTERNAL_BIT_UTIL_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
/* Sanity check. */
#if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \
#if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \
|| !defined(JEMALLOC_INTERNAL_FFS)
# error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure
# error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure
#endif
/*
@@ -34,6 +35,7 @@ ffs_u(unsigned x) {
return JEMALLOC_INTERNAL_FFS(x) - 1;
}
/* clang-format off */
#define DO_FLS_SLOW(x, suffix) do { \
util_assume(x != 0); \
x |= (x >> 1); \
@@ -57,6 +59,7 @@ ffs_u(unsigned x) {
} \
return ffs_##suffix(x) - 1; \
} while(0)
/* clang-format on */
static inline unsigned
fls_llu_slow(unsigned long long x) {
@@ -107,16 +110,19 @@ fls_u(unsigned x) {
}
#elif defined(_MSC_VER)
#if LG_SIZEOF_PTR == 3
#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
#else
# if LG_SIZEOF_PTR == 3
# define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
# else
/*
* This never actually runs; we're just dodging a compiler error for the
* never-taken branch where sizeof(void *) == 8.
*/
#define DO_BSR64(bit, x) bit = 0; unreachable()
#endif
# define DO_BSR64(bit, x) \
bit = 0; \
unreachable()
# endif
/* clang-format off */
#define DO_FLS(x) do { \
if (x == 0) { \
return 8 * sizeof(x); \
@@ -143,6 +149,7 @@ fls_u(unsigned x) {
} \
unreachable(); \
} while (0)
/* clang-format on */
static inline unsigned
fls_llu(unsigned long long x) {
@@ -159,8 +166,8 @@ fls_u(unsigned x) {
DO_FLS(x);
}
#undef DO_FLS
#undef DO_BSR64
# undef DO_FLS
# undef DO_BSR64
#else
static inline unsigned
@@ -180,9 +187,10 @@ fls_u(unsigned x) {
#endif
#if LG_SIZEOF_LONG_LONG > 3
# error "Haven't implemented popcount for 16-byte ints."
# error "Haven't implemented popcount for 16-byte ints."
#endif
/* clang-format off */
#define DO_POPCOUNT(x, type) do { \
/* \
* Algorithm from an old AMD optimization reference manual. \
@@ -226,6 +234,7 @@ fls_u(unsigned x) {
x >>= ((sizeof(x) - 1) * 8); \
return (unsigned)x; \
} while(0)
/* clang-format on */
static inline unsigned
popcount_u_slow(unsigned bitmap) {
@@ -277,7 +286,7 @@ popcount_llu(unsigned long long bitmap) {
*/
static inline size_t
cfs_lu(unsigned long* bitmap) {
cfs_lu(unsigned long *bitmap) {
util_assume(*bitmap != 0);
size_t bit = ffs_lu(*bitmap);
*bitmap ^= ZU(1) << bit;
@@ -293,7 +302,7 @@ ffs_zu(size_t x) {
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
return ffs_llu(x);
#else
#error No implementation for size_t ffs()
# error No implementation for size_t ffs()
#endif
}
@@ -306,11 +315,10 @@ fls_zu(size_t x) {
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
return fls_llu(x);
#else
#error No implementation for size_t fls()
# error No implementation for size_t fls()
#endif
}
static inline unsigned
ffs_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
@@ -318,7 +326,7 @@ ffs_u64(uint64_t x) {
#elif LG_SIZEOF_LONG_LONG == 3
return ffs_llu(x);
#else
#error No implementation for 64-bit ffs()
# error No implementation for 64-bit ffs()
#endif
}
@@ -329,7 +337,7 @@ fls_u64(uint64_t x) {
#elif LG_SIZEOF_LONG_LONG == 3
return fls_llu(x);
#else
#error No implementation for 64-bit fls()
# error No implementation for 64-bit fls()
#endif
}
@@ -338,9 +346,8 @@ ffs_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
return ffs_u(x);
#else
#error No implementation for 32-bit ffs()
# error No implementation for 32-bit ffs()
#endif
return ffs_u(x);
}
static inline unsigned
@@ -348,9 +355,8 @@ fls_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
return fls_u(x);
#else
#error No implementation for 32-bit fls()
# error No implementation for 32-bit fls()
#endif
return fls_u(x);
}
static inline uint64_t
@@ -370,7 +376,7 @@ pow2_ceil_u64(uint64_t x) {
static inline uint32_t
pow2_ceil_u32(uint32_t x) {
if (unlikely(x <= 1)) {
return x;
return x;
}
size_t msb_on_index = fls_u32(x - 1);
/* As above. */
@@ -408,13 +414,16 @@ lg_ceil(size_t x) {
#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2))
#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4))
#define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8))
#define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16))
#define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32))
#define LG_FLOOR_16(x) \
(x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8))
#define LG_FLOOR_32(x) \
(x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16))
#define LG_FLOOR_64(x) \
(x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32))
#if LG_SIZEOF_PTR == 2
# define LG_FLOOR(x) LG_FLOOR_32((x))
# define LG_FLOOR(x) LG_FLOOR_32((x))
#else
# define LG_FLOOR(x) LG_FLOOR_64((x))
# define LG_FLOOR(x) LG_FLOOR_64((x))
#endif
#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1))


@@ -1,26 +1,27 @@
#ifndef JEMALLOC_INTERNAL_BITMAP_H
#define JEMALLOC_INTERNAL_BITMAP_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/bit_util.h"
#include "jemalloc/internal/sc.h"
typedef unsigned long bitmap_t;
#define LG_SIZEOF_BITMAP LG_SIZEOF_LONG
#define LG_SIZEOF_BITMAP LG_SIZEOF_LONG
/* Maximum bitmap bit count is 2^LG_BITMAP_MAXBITS. */
#if SC_LG_SLAB_MAXREGS > LG_CEIL(SC_NSIZES)
/* Maximum bitmap bit count is determined by maximum regions per slab. */
# define LG_BITMAP_MAXBITS SC_LG_SLAB_MAXREGS
# define LG_BITMAP_MAXBITS SC_LG_SLAB_MAXREGS
#else
/* Maximum bitmap bit count is determined by number of extent size classes. */
# define LG_BITMAP_MAXBITS LG_CEIL(SC_NSIZES)
# define LG_BITMAP_MAXBITS LG_CEIL(SC_NSIZES)
#endif
#define BITMAP_MAXBITS (ZU(1) << LG_BITMAP_MAXBITS)
#define BITMAP_MAXBITS (ZU(1) << LG_BITMAP_MAXBITS)
/* Number of bits per group. */
#define LG_BITMAP_GROUP_NBITS (LG_SIZEOF_BITMAP + 3)
#define BITMAP_GROUP_NBITS (1U << LG_BITMAP_GROUP_NBITS)
#define BITMAP_GROUP_NBITS_MASK (BITMAP_GROUP_NBITS-1)
#define LG_BITMAP_GROUP_NBITS (LG_SIZEOF_BITMAP + 3)
#define BITMAP_GROUP_NBITS (1U << LG_BITMAP_GROUP_NBITS)
#define BITMAP_GROUP_NBITS_MASK (BITMAP_GROUP_NBITS - 1)
/*
* Do some analysis on how big the bitmap is before we use a tree. For a brute
@@ -28,67 +29,64 @@ typedef unsigned long bitmap_t;
* use a tree instead.
*/
#if LG_BITMAP_MAXBITS - LG_BITMAP_GROUP_NBITS > 3
# define BITMAP_USE_TREE
# define BITMAP_USE_TREE
#endif
/* Number of groups required to store a given number of bits. */
#define BITMAP_BITS2GROUPS(nbits) \
(((nbits) + BITMAP_GROUP_NBITS_MASK) >> LG_BITMAP_GROUP_NBITS)
#define BITMAP_BITS2GROUPS(nbits) \
(((nbits) + BITMAP_GROUP_NBITS_MASK) >> LG_BITMAP_GROUP_NBITS)
/*
* Number of groups required at a particular level for a given number of bits.
*/
#define BITMAP_GROUPS_L0(nbits) \
BITMAP_BITS2GROUPS(nbits)
#define BITMAP_GROUPS_L1(nbits) \
BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(nbits))
#define BITMAP_GROUPS_L2(nbits) \
BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits))))
#define BITMAP_GROUPS_L3(nbits) \
BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS( \
BITMAP_BITS2GROUPS((nbits)))))
#define BITMAP_GROUPS_L4(nbits) \
BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS( \
BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits))))))
#define BITMAP_GROUPS_L0(nbits) BITMAP_BITS2GROUPS(nbits)
#define BITMAP_GROUPS_L1(nbits) BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(nbits))
#define BITMAP_GROUPS_L2(nbits) \
BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits))))
#define BITMAP_GROUPS_L3(nbits) \
BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS( \
BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits)))))
#define BITMAP_GROUPS_L4(nbits) \
BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS( \
BITMAP_BITS2GROUPS(BITMAP_BITS2GROUPS((nbits))))))
/*
* Assuming the number of levels, number of groups required for a given number
* of bits.
*/
#define BITMAP_GROUPS_1_LEVEL(nbits) \
BITMAP_GROUPS_L0(nbits)
#define BITMAP_GROUPS_2_LEVEL(nbits) \
(BITMAP_GROUPS_1_LEVEL(nbits) + BITMAP_GROUPS_L1(nbits))
#define BITMAP_GROUPS_3_LEVEL(nbits) \
(BITMAP_GROUPS_2_LEVEL(nbits) + BITMAP_GROUPS_L2(nbits))
#define BITMAP_GROUPS_4_LEVEL(nbits) \
(BITMAP_GROUPS_3_LEVEL(nbits) + BITMAP_GROUPS_L3(nbits))
#define BITMAP_GROUPS_5_LEVEL(nbits) \
(BITMAP_GROUPS_4_LEVEL(nbits) + BITMAP_GROUPS_L4(nbits))
#define BITMAP_GROUPS_1_LEVEL(nbits) BITMAP_GROUPS_L0(nbits)
#define BITMAP_GROUPS_2_LEVEL(nbits) \
(BITMAP_GROUPS_1_LEVEL(nbits) + BITMAP_GROUPS_L1(nbits))
#define BITMAP_GROUPS_3_LEVEL(nbits) \
(BITMAP_GROUPS_2_LEVEL(nbits) + BITMAP_GROUPS_L2(nbits))
#define BITMAP_GROUPS_4_LEVEL(nbits) \
(BITMAP_GROUPS_3_LEVEL(nbits) + BITMAP_GROUPS_L3(nbits))
#define BITMAP_GROUPS_5_LEVEL(nbits) \
(BITMAP_GROUPS_4_LEVEL(nbits) + BITMAP_GROUPS_L4(nbits))
/*
* Maximum number of groups required to support LG_BITMAP_MAXBITS.
*/
#ifdef BITMAP_USE_TREE
#if LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS
# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_1_LEVEL(nbits)
# define BITMAP_GROUPS_MAX BITMAP_GROUPS_1_LEVEL(BITMAP_MAXBITS)
#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 2
# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_2_LEVEL(nbits)
# define BITMAP_GROUPS_MAX BITMAP_GROUPS_2_LEVEL(BITMAP_MAXBITS)
#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 3
# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_3_LEVEL(nbits)
# define BITMAP_GROUPS_MAX BITMAP_GROUPS_3_LEVEL(BITMAP_MAXBITS)
#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 4
# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_4_LEVEL(nbits)
# define BITMAP_GROUPS_MAX BITMAP_GROUPS_4_LEVEL(BITMAP_MAXBITS)
#elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 5
# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_5_LEVEL(nbits)
# define BITMAP_GROUPS_MAX BITMAP_GROUPS_5_LEVEL(BITMAP_MAXBITS)
#else
# error "Unsupported bitmap size"
#endif
# if LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS
# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_1_LEVEL(nbits)
# define BITMAP_GROUPS_MAX BITMAP_GROUPS_1_LEVEL(BITMAP_MAXBITS)
# elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 2
# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_2_LEVEL(nbits)
# define BITMAP_GROUPS_MAX BITMAP_GROUPS_2_LEVEL(BITMAP_MAXBITS)
# elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 3
# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_3_LEVEL(nbits)
# define BITMAP_GROUPS_MAX BITMAP_GROUPS_3_LEVEL(BITMAP_MAXBITS)
# elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 4
# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_4_LEVEL(nbits)
# define BITMAP_GROUPS_MAX BITMAP_GROUPS_4_LEVEL(BITMAP_MAXBITS)
# elif LG_BITMAP_MAXBITS <= LG_BITMAP_GROUP_NBITS * 5
# define BITMAP_GROUPS(nbits) BITMAP_GROUPS_5_LEVEL(nbits)
# define BITMAP_GROUPS_MAX BITMAP_GROUPS_5_LEVEL(BITMAP_MAXBITS)
# else
# error "Unsupported bitmap size"
# endif
/*
* Maximum number of levels possible. This could be statically computed based
@@ -104,42 +102,53 @@ typedef unsigned long bitmap_t;
* unused trailing entries in bitmap_info_t structures; the bitmaps themselves
* are not impacted.
*/
#define BITMAP_MAX_LEVELS 5
# define BITMAP_MAX_LEVELS 5
#define BITMAP_INFO_INITIALIZER(nbits) { \
/* nbits. */ \
nbits, \
/* nlevels. */ \
(BITMAP_GROUPS_L0(nbits) > BITMAP_GROUPS_L1(nbits)) + \
(BITMAP_GROUPS_L1(nbits) > BITMAP_GROUPS_L2(nbits)) + \
(BITMAP_GROUPS_L2(nbits) > BITMAP_GROUPS_L3(nbits)) + \
(BITMAP_GROUPS_L3(nbits) > BITMAP_GROUPS_L4(nbits)) + 1, \
/* levels. */ \
{ \
{0}, \
{BITMAP_GROUPS_L0(nbits)}, \
{BITMAP_GROUPS_L1(nbits) + BITMAP_GROUPS_L0(nbits)}, \
{BITMAP_GROUPS_L2(nbits) + BITMAP_GROUPS_L1(nbits) + \
BITMAP_GROUPS_L0(nbits)}, \
{BITMAP_GROUPS_L3(nbits) + BITMAP_GROUPS_L2(nbits) + \
BITMAP_GROUPS_L1(nbits) + BITMAP_GROUPS_L0(nbits)}, \
{BITMAP_GROUPS_L4(nbits) + BITMAP_GROUPS_L3(nbits) + \
BITMAP_GROUPS_L2(nbits) + BITMAP_GROUPS_L1(nbits) \
+ BITMAP_GROUPS_L0(nbits)} \
} \
}
# define BITMAP_INFO_INITIALIZER(nbits) \
{ \
/* nbits. */ \
nbits, /* nlevels. */ \
(BITMAP_GROUPS_L0(nbits) \
> BITMAP_GROUPS_L1(nbits)) \
+ (BITMAP_GROUPS_L1(nbits) \
> BITMAP_GROUPS_L2(nbits)) \
+ (BITMAP_GROUPS_L2(nbits) \
> BITMAP_GROUPS_L3(nbits)) \
+ (BITMAP_GROUPS_L3(nbits) \
> BITMAP_GROUPS_L4(nbits)) \
+ 1, /* levels. */ \
{ \
{0}, {BITMAP_GROUPS_L0(nbits)}, \
{BITMAP_GROUPS_L1(nbits) \
+ BITMAP_GROUPS_L0(nbits)}, \
{BITMAP_GROUPS_L2(nbits) \
+ BITMAP_GROUPS_L1(nbits) \
+ BITMAP_GROUPS_L0(nbits)}, \
{BITMAP_GROUPS_L3(nbits) \
+ BITMAP_GROUPS_L2(nbits) \
+ BITMAP_GROUPS_L1(nbits) \
+ BITMAP_GROUPS_L0(nbits)}, \
{ \
BITMAP_GROUPS_L4(nbits) \
+ BITMAP_GROUPS_L3(nbits) \
+ BITMAP_GROUPS_L2(nbits) \
+ BITMAP_GROUPS_L1(nbits) \
+ BITMAP_GROUPS_L0(nbits) \
} \
} \
}
#else /* BITMAP_USE_TREE */
#define BITMAP_GROUPS(nbits) BITMAP_BITS2GROUPS(nbits)
#define BITMAP_GROUPS_MAX BITMAP_BITS2GROUPS(BITMAP_MAXBITS)
# define BITMAP_GROUPS(nbits) BITMAP_BITS2GROUPS(nbits)
# define BITMAP_GROUPS_MAX BITMAP_BITS2GROUPS(BITMAP_MAXBITS)
#define BITMAP_INFO_INITIALIZER(nbits) { \
/* nbits. */ \
nbits, \
/* ngroups. */ \
BITMAP_BITS2GROUPS(nbits) \
}
# define BITMAP_INFO_INITIALIZER(nbits) \
{ \
/* nbits. */ \
nbits, /* ngroups. */ \
BITMAP_BITS2GROUPS(nbits) \
}
#endif /* BITMAP_USE_TREE */
@@ -160,21 +169,21 @@ typedef struct bitmap_info_s {
* Only the first (nlevels+1) elements are used, and levels are ordered
* bottom to top (e.g. the bottom level is stored in levels[0]).
*/
bitmap_level_t levels[BITMAP_MAX_LEVELS+1];
#else /* BITMAP_USE_TREE */
bitmap_level_t levels[BITMAP_MAX_LEVELS + 1];
#else /* BITMAP_USE_TREE */
/* Number of groups necessary for nbits. */
size_t ngroups;
#endif /* BITMAP_USE_TREE */
} bitmap_info_t;
void bitmap_info_init(bitmap_info_t *binfo, size_t nbits);
void bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo, bool fill);
void bitmap_info_init(bitmap_info_t *binfo, size_t nbits);
void bitmap_init(bitmap_t *bitmap, const bitmap_info_t *binfo, bool fill);
size_t bitmap_size(const bitmap_info_t *binfo);
static inline bool
bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo) {
#ifdef BITMAP_USE_TREE
size_t rgoff = binfo->levels[binfo->nlevels].group_offset - 1;
size_t rgoff = binfo->levels[binfo->nlevels].group_offset - 1;
bitmap_t rg = bitmap[rgoff];
/* The bitmap is full iff the root group is 0. */
return (rg == 0);
@@ -192,7 +201,7 @@ bitmap_full(bitmap_t *bitmap, const bitmap_info_t *binfo) {
static inline bool
bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
size_t goff;
size_t goff;
bitmap_t g;
assert(bit < binfo->nbits);
@@ -203,9 +212,9 @@ bitmap_get(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
static inline void
bitmap_set(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
size_t goff;
size_t goff;
bitmap_t *gp;
bitmap_t g;
bitmap_t g;
assert(bit < binfo->nbits);
assert(!bitmap_get(bitmap, binfo, bit));
@@ -244,12 +253,13 @@ bitmap_ffu(const bitmap_t *bitmap, const bitmap_info_t *binfo, size_t min_bit) {
#ifdef BITMAP_USE_TREE
size_t bit = 0;
for (unsigned level = binfo->nlevels; level--;) {
size_t lg_bits_per_group = (LG_BITMAP_GROUP_NBITS * (level +
1));
bitmap_t group = bitmap[binfo->levels[level].group_offset + (bit
>> lg_bits_per_group)];
unsigned group_nmask = (unsigned)(((min_bit > bit) ? (min_bit -
bit) : 0) >> (lg_bits_per_group - LG_BITMAP_GROUP_NBITS));
size_t lg_bits_per_group = (LG_BITMAP_GROUP_NBITS
* (level + 1));
bitmap_t group = bitmap[binfo->levels[level].group_offset
+ (bit >> lg_bits_per_group)];
unsigned group_nmask =
(unsigned)(((min_bit > bit) ? (min_bit - bit) : 0)
>> (lg_bits_per_group - LG_BITMAP_GROUP_NBITS));
assert(group_nmask <= BITMAP_GROUP_NBITS);
bitmap_t group_mask = ~((1LU << group_nmask) - 1);
bitmap_t group_masked = group & group_mask;
@@ -272,25 +282,28 @@ bitmap_ffu(const bitmap_t *bitmap, const bitmap_info_t *binfo, size_t min_bit) {
}
return bitmap_ffu(bitmap, binfo, sib_base);
}
bit += ((size_t)ffs_lu(group_masked)) <<
(lg_bits_per_group - LG_BITMAP_GROUP_NBITS);
bit += ((size_t)ffs_lu(group_masked))
<< (lg_bits_per_group - LG_BITMAP_GROUP_NBITS);
}
assert(bit >= min_bit);
assert(bit < binfo->nbits);
return bit;
#else
size_t i = min_bit >> LG_BITMAP_GROUP_NBITS;
bitmap_t g = bitmap[i] & ~((1LU << (min_bit & BITMAP_GROUP_NBITS_MASK))
- 1);
size_t i = min_bit >> LG_BITMAP_GROUP_NBITS;
bitmap_t g = bitmap[i]
& ~((1LU << (min_bit & BITMAP_GROUP_NBITS_MASK)) - 1);
size_t bit;
do {
while (1) {
if (g != 0) {
bit = ffs_lu(g);
return (i << LG_BITMAP_GROUP_NBITS) + bit;
}
i++;
if (i >= binfo->ngroups) {
break;
}
g = bitmap[i];
} while (i < binfo->ngroups);
}
return binfo->nbits;
#endif
}
@@ -298,7 +311,7 @@ bitmap_ffu(const bitmap_t *bitmap, const bitmap_info_t *binfo, size_t min_bit) {
/* sfu: set first unset. */
static inline size_t
bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo) {
size_t bit;
size_t bit;
bitmap_t g;
unsigned i;
@@ -328,9 +341,9 @@ bitmap_sfu(bitmap_t *bitmap, const bitmap_info_t *binfo) {
static inline void
bitmap_unset(bitmap_t *bitmap, const bitmap_info_t *binfo, size_t bit) {
size_t goff;
bitmap_t *gp;
bitmap_t g;
size_t goff;
bitmap_t *gp;
bitmap_t g;
UNUSED bool propagate;
assert(bit < binfo->nbits);


@@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_BUF_WRITER_H
#define JEMALLOC_INTERNAL_BUF_WRITER_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/tsd_types.h"
/*
* Note: when using the buffered writer, cbopaque is passed to write_cb only
* when the buffer is flushed. It would make a difference if cbopaque points
@@ -12,21 +16,21 @@
typedef struct {
write_cb_t *write_cb;
void *cbopaque;
char *buf;
size_t buf_size;
size_t buf_end;
bool internal_buf;
void *cbopaque;
char *buf;
size_t buf_size;
size_t buf_end;
bool internal_buf;
} buf_writer_t;
bool buf_writer_init(tsdn_t *tsdn, buf_writer_t *buf_writer,
write_cb_t *write_cb, void *cbopaque, char *buf, size_t buf_len);
void buf_writer_flush(buf_writer_t *buf_writer);
bool buf_writer_init(tsdn_t *tsdn, buf_writer_t *buf_writer,
write_cb_t *write_cb, void *cbopaque, char *buf, size_t buf_len);
void buf_writer_flush(buf_writer_t *buf_writer);
write_cb_t buf_writer_cb;
void buf_writer_terminate(tsdn_t *tsdn, buf_writer_t *buf_writer);
void buf_writer_terminate(tsdn_t *tsdn, buf_writer_t *buf_writer);
typedef ssize_t (read_cb_t)(void *read_cbopaque, void *buf, size_t limit);
void buf_writer_pipe(buf_writer_t *buf_writer, read_cb_t *read_cb,
void *read_cbopaque);
typedef ssize_t(read_cb_t)(void *read_cbopaque, void *buf, size_t limit);
void buf_writer_pipe(
buf_writer_t *buf_writer, read_cb_t *read_cb, void *read_cbopaque);
#endif /* JEMALLOC_INTERNAL_BUF_WRITER_H */


@@ -1,7 +1,10 @@
#ifndef JEMALLOC_INTERNAL_CACHE_BIN_H
#define JEMALLOC_INTERNAL_CACHE_BIN_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_externs.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/safety_check.h"
#include "jemalloc/internal/sz.h"
/*
@@ -20,16 +23,20 @@
*/
typedef uint16_t cache_bin_sz_t;
#define JUNK_ADDR ((uintptr_t)0x7a7a7a7a7a7a7a7aULL)
/*
* Leave a noticeable mark pattern on the cache bin stack boundaries, in case a
* bug starts leaking those. Make it look like the junk pattern but be distinct
* from it.
*/
static const uintptr_t cache_bin_preceding_junk =
(uintptr_t)0x7a7a7a7a7a7a7a7aULL;
/* Note: a7 vs. 7a above -- this tells you which pointer leaked. */
static const uintptr_t cache_bin_trailing_junk =
(uintptr_t)0xa7a7a7a7a7a7a7a7ULL;
static const uintptr_t cache_bin_preceding_junk = JUNK_ADDR;
/* Note: JUNK_ADDR vs. JUNK_ADDR + 1 -- this tells you which pointer leaked. */
static const uintptr_t cache_bin_trailing_junk = JUNK_ADDR + 1;
/*
* A pointer used to initialize a fake stack_head for disabled small bins
* so that the enabled/disabled assessment does not rely on ncached_max.
*/
extern const uintptr_t disabled_bin;
/*
* That implies the following value, for the maximum number of items in any
@@ -38,8 +45,8 @@ static const uintptr_t cache_bin_trailing_junk =
* 1 << (sizeof(cache_bin_sz_t) * 8)
* bytes spread across pointer sized objects to get the maximum.
*/
#define CACHE_BIN_NCACHED_MAX (((size_t)1 << sizeof(cache_bin_sz_t) * 8) \
/ sizeof(void *) - 1)
#define CACHE_BIN_NCACHED_MAX \
(((size_t)1 << sizeof(cache_bin_sz_t) * 8) / sizeof(void *) - 1)
/*
* This lives inside the cache_bin (for locality reasons), and is initialized
@@ -101,7 +108,7 @@ struct cache_bin_s {
* Since the stack grows down, this is a higher address than
* low_bits_full.
*/
uint16_t low_bits_low_water;
cache_bin_sz_t low_bits_low_water;
/*
* The low bits of the value that stack_head will take on when the array
@@ -112,7 +119,7 @@ struct cache_bin_s {
* Recall that since the stack grows down, this is the lowest available
* address in the array for caching. Only adjusted when stashing items.
*/
uint16_t low_bits_full;
cache_bin_sz_t low_bits_full;
/*
* The low bits of the value that stack_head will take on when the array
@@ -121,7 +128,10 @@ struct cache_bin_s {
* The stack grows down -- this is one past the highest address in the
* array. Immutable after initialization.
*/
uint16_t low_bits_empty;
cache_bin_sz_t low_bits_empty;
/* The maximum number of cached items in the bin. */
cache_bin_info_t bin_info;
};
/*
@@ -142,8 +152,8 @@ };
};
static inline void
cache_bin_array_descriptor_init(cache_bin_array_descriptor_t *descriptor,
cache_bin_t *bins) {
cache_bin_array_descriptor_init(
cache_bin_array_descriptor_t *descriptor, cache_bin_t *bins) {
ql_elm_new(descriptor, link);
descriptor->bins = bins;
}
@@ -168,10 +178,41 @@ cache_bin_nonfast_aligned(const void *ptr) {
return ((uintptr_t)ptr & san_cache_bin_nonfast_mask) == 0;
}
static inline const void *
cache_bin_disabled_bin_stack(void) {
return &disabled_bin;
}
/*
* If a cache bin was zero initialized (either because it lives in static or
* thread-local storage, or was memset to 0), this function indicates whether or
* not cache_bin_init was called on it.
*/
static inline bool
cache_bin_still_zero_initialized(cache_bin_t *bin) {
return bin->stack_head == NULL;
}
static inline bool
cache_bin_disabled(cache_bin_t *bin) {
bool disabled = (bin->stack_head == cache_bin_disabled_bin_stack());
if (disabled) {
assert((uintptr_t)(*bin->stack_head) == JUNK_ADDR);
}
return disabled;
}
/* Gets ncached_max without asserting that the bin is enabled. */
static inline cache_bin_sz_t
cache_bin_ncached_max_get_unsafe(cache_bin_t *bin) {
return bin->bin_info.ncached_max;
}
/* Returns ncached_max: Upper limit on ncached. */
static inline cache_bin_sz_t
cache_bin_info_ncached_max(cache_bin_info_t *info) {
return info->ncached_max;
cache_bin_ncached_max_get(cache_bin_t *bin) {
assert(!cache_bin_disabled(bin));
return cache_bin_ncached_max_get_unsafe(bin);
}
/*
@@ -181,7 +222,8 @@ cache_bin_info_ncached_max(cache_bin_info_t *info) {
* with later.
*/
static inline void
cache_bin_assert_earlier(cache_bin_t *bin, uint16_t earlier, uint16_t later) {
cache_bin_assert_earlier(
cache_bin_t *bin, cache_bin_sz_t earlier, cache_bin_sz_t later) {
if (earlier > later) {
assert(bin->low_bits_full > bin->low_bits_empty);
}
@@ -193,28 +235,19 @@ cache_bin_assert_earlier(cache_bin_t *bin, uint16_t earlier, uint16_t later) {
* Does difference calculations that handle wraparound correctly. Earlier must
* be associated with the position earlier in memory.
*/
static inline uint16_t
cache_bin_diff(cache_bin_t *bin, uint16_t earlier, uint16_t later, bool racy) {
/*
* When it's racy, bin->low_bits_full can be modified concurrently. It
* can cross the uint16_t max value and become less than
* bin->low_bits_empty at the time of the check.
*/
if (!racy) {
cache_bin_assert_earlier(bin, earlier, later);
}
static inline cache_bin_sz_t
cache_bin_diff(cache_bin_t *bin, cache_bin_sz_t earlier, cache_bin_sz_t later) {
cache_bin_assert_earlier(bin, earlier, later);
return later - earlier;
}
/*
* Number of items currently cached in the bin, without checking ncached_max.
* We require specifying whether or not the request is racy or not (i.e. whether
* or not concurrent modifications are possible).
*/
static inline cache_bin_sz_t
cache_bin_ncached_get_internal(cache_bin_t *bin, bool racy) {
cache_bin_ncached_get_internal(cache_bin_t *bin) {
cache_bin_sz_t diff = cache_bin_diff(bin,
(uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty, racy);
(cache_bin_sz_t)(uintptr_t)bin->stack_head, bin->low_bits_empty);
cache_bin_sz_t n = diff / sizeof(void *);
/*
* We have undefined behavior here; if this function is called from the
@@ -225,7 +258,7 @@ cache_bin_ncached_get_internal(cache_bin_t *bin, bool racy) {
* fast paths. This should still be "safe" in the sense of generating
* the correct assembly for the foreseeable future, though.
*/
assert(n == 0 || *(bin->stack_head) != NULL || racy);
assert(n == 0 || *(bin->stack_head) != NULL);
return n;
}
@@ -235,10 +268,9 @@ cache_bin_ncached_get_internal(cache_bin_t *bin, bool racy) {
* possible.
*/
static inline cache_bin_sz_t
cache_bin_ncached_get_local(cache_bin_t *bin, cache_bin_info_t *info) {
cache_bin_sz_t n = cache_bin_ncached_get_internal(bin,
/* racy */ false);
assert(n <= cache_bin_info_ncached_max(info));
cache_bin_ncached_get_local(cache_bin_t *bin) {
cache_bin_sz_t n = cache_bin_ncached_get_internal(bin);
assert(n <= cache_bin_ncached_max_get(bin));
return n;
}
@@ -253,10 +285,9 @@ cache_bin_ncached_get_local(cache_bin_t *bin, cache_bin_info_t *info) {
static inline void **
cache_bin_empty_position_get(cache_bin_t *bin) {
cache_bin_sz_t diff = cache_bin_diff(bin,
(uint16_t)(uintptr_t)bin->stack_head, bin->low_bits_empty,
/* racy */ false);
uintptr_t empty_bits = (uintptr_t)bin->stack_head + diff;
void **ret = (void **)empty_bits;
(cache_bin_sz_t)(uintptr_t)bin->stack_head, bin->low_bits_empty);
byte_t *empty_bits = (byte_t *)bin->stack_head + diff;
void **ret = (void **)empty_bits;
assert(ret >= bin->stack_head);
@@ -273,10 +304,10 @@ cache_bin_empty_position_get(cache_bin_t *bin) {
* multithreaded environment. Currently concurrent access happens only during
* arena statistics collection.
*/
static inline uint16_t
cache_bin_low_bits_low_bound_get(cache_bin_t *bin, cache_bin_info_t *info) {
return (uint16_t)bin->low_bits_empty -
info->ncached_max * sizeof(void *);
static inline cache_bin_sz_t
cache_bin_low_bits_low_bound_get(cache_bin_t *bin) {
return (cache_bin_sz_t)bin->low_bits_empty
- cache_bin_ncached_max_get(bin) * sizeof(void *);
}
/*
@@ -285,9 +316,9 @@ cache_bin_low_bits_low_bound_get(cache_bin_t *bin, cache_bin_info_t *info) {
* A pointer to the position with the lowest address of the backing array.
*/
static inline void **
cache_bin_low_bound_get(cache_bin_t *bin, cache_bin_info_t *info) {
cache_bin_sz_t ncached_max = cache_bin_info_ncached_max(info);
void **ret = cache_bin_empty_position_get(bin) - ncached_max;
cache_bin_low_bound_get(cache_bin_t *bin) {
cache_bin_sz_t ncached_max = cache_bin_ncached_max_get(bin);
void **ret = cache_bin_empty_position_get(bin) - ncached_max;
assert(ret <= bin->stack_head);
return ret;
@@ -298,8 +329,8 @@ cache_bin_low_bound_get(cache_bin_t *bin, cache_bin_info_t *info) {
* batch fill a nonempty cache bin.
*/
static inline void
cache_bin_assert_empty(cache_bin_t *bin, cache_bin_info_t *info) {
assert(cache_bin_ncached_get_local(bin, info) == 0);
cache_bin_assert_empty(cache_bin_t *bin) {
assert(cache_bin_ncached_get_local(bin) == 0);
assert(cache_bin_empty_position_get(bin) == bin->stack_head);
}
@@ -310,18 +341,19 @@ cache_bin_assert_empty(cache_bin_t *bin, cache_bin_info_t *info) {
*/
static inline cache_bin_sz_t
cache_bin_low_water_get_internal(cache_bin_t *bin) {
return cache_bin_diff(bin, bin->low_bits_low_water,
bin->low_bits_empty, /* racy */ false) / sizeof(void *);
return cache_bin_diff(bin, bin->low_bits_low_water, bin->low_bits_empty)
/ sizeof(void *);
}
/* Returns the numeric value of low water in [0, ncached]. */
static inline cache_bin_sz_t
cache_bin_low_water_get(cache_bin_t *bin, cache_bin_info_t *info) {
cache_bin_low_water_get(cache_bin_t *bin) {
cache_bin_sz_t low_water = cache_bin_low_water_get_internal(bin);
assert(low_water <= cache_bin_info_ncached_max(info));
assert(low_water <= cache_bin_ncached_get_local(bin, info));
assert(low_water <= cache_bin_ncached_max_get(bin));
assert(low_water <= cache_bin_ncached_get_local(bin));
cache_bin_assert_earlier(bin, (uint16_t)(uintptr_t)bin->stack_head,
cache_bin_assert_earlier(bin,
(cache_bin_sz_t)(uintptr_t)bin->stack_head,
bin->low_bits_low_water);
return low_water;
@@ -333,12 +365,14 @@ cache_bin_low_water_get(cache_bin_t *bin, cache_bin_info_t *info) {
*/
static inline void
cache_bin_low_water_set(cache_bin_t *bin) {
bin->low_bits_low_water = (uint16_t)(uintptr_t)bin->stack_head;
assert(!cache_bin_disabled(bin));
bin->low_bits_low_water = (cache_bin_sz_t)(uintptr_t)bin->stack_head;
}
static inline void
cache_bin_low_water_adjust(cache_bin_t *bin) {
if (cache_bin_ncached_get_internal(bin, /* racy */ false)
assert(!cache_bin_disabled(bin));
if (cache_bin_ncached_get_internal(bin)
< cache_bin_low_water_get_internal(bin)) {
cache_bin_low_water_set(bin);
}
@@ -358,9 +392,9 @@ cache_bin_alloc_impl(cache_bin_t *bin, bool *success, bool adjust_low_water) {
* This may read from the empty position; however the loaded value won't
* be used. It's safe because the stack has one more slot reserved.
*/
void *ret = *bin->stack_head;
uint16_t low_bits = (uint16_t)(uintptr_t)bin->stack_head;
void **new_head = bin->stack_head + 1;
void *ret = *bin->stack_head;
cache_bin_sz_t low_bits = (cache_bin_sz_t)(uintptr_t)bin->stack_head;
void **new_head = bin->stack_head + 1;
/*
* Note that the low water mark is at most empty; if we pass this check,
@@ -382,7 +416,7 @@ cache_bin_alloc_impl(cache_bin_t *bin, bool *success, bool adjust_low_water) {
*/
if (likely(low_bits != bin->low_bits_empty)) {
bin->stack_head = new_head;
bin->low_bits_low_water = (uint16_t)(uintptr_t)new_head;
bin->low_bits_low_water = (cache_bin_sz_t)(uintptr_t)new_head;
*success = true;
return ret;
}
@@ -410,8 +444,7 @@ cache_bin_alloc(cache_bin_t *bin, bool *success) {
JEMALLOC_ALWAYS_INLINE cache_bin_sz_t
cache_bin_alloc_batch(cache_bin_t *bin, size_t num, void **out) {
cache_bin_sz_t n = cache_bin_ncached_get_internal(bin,
/* racy */ false);
cache_bin_sz_t n = cache_bin_ncached_get_internal(bin);
if (n > num) {
n = (cache_bin_sz_t)num;
}
@@ -424,7 +457,37 @@ cache_bin_alloc_batch(cache_bin_t *bin, size_t num, void **out) {
JEMALLOC_ALWAYS_INLINE bool
cache_bin_full(cache_bin_t *bin) {
return ((uint16_t)(uintptr_t)bin->stack_head == bin->low_bits_full);
return (
(cache_bin_sz_t)(uintptr_t)bin->stack_head == bin->low_bits_full);
}
/*
* Scans the allocated area of the cache_bin for the given pointer up to limit.
* Fires safety_check_fail if the ptr is found and returns true.
*/
JEMALLOC_ALWAYS_INLINE bool
cache_bin_dalloc_safety_checks(cache_bin_t *bin, void *ptr) {
if (!config_debug || opt_debug_double_free_max_scan == 0) {
return false;
}
cache_bin_sz_t ncached = cache_bin_ncached_get_internal(bin);
unsigned max_scan = opt_debug_double_free_max_scan < ncached
? opt_debug_double_free_max_scan
: ncached;
void **cur = bin->stack_head;
void **limit = cur + max_scan;
for (; cur < limit; cur++) {
if (*cur == ptr) {
safety_check_fail(
"Invalid deallocation detected: double free of "
"pointer %p\n",
ptr);
return true;
}
}
return false;
}
/*
@@ -436,10 +499,14 @@ cache_bin_dalloc_easy(cache_bin_t *bin, void *ptr) {
return false;
}
if (unlikely(cache_bin_dalloc_safety_checks(bin, ptr))) {
return true;
}
bin->stack_head--;
*bin->stack_head = ptr;
cache_bin_assert_earlier(bin, bin->low_bits_full,
(uint16_t)(uintptr_t)bin->stack_head);
(cache_bin_sz_t)(uintptr_t)bin->stack_head);
return true;
}
@@ -452,11 +519,12 @@ cache_bin_stash(cache_bin_t *bin, void *ptr) {
}
/* Stash at the full position, in the [full, head) range. */
uint16_t low_bits_head = (uint16_t)(uintptr_t)bin->stack_head;
cache_bin_sz_t low_bits_head = (cache_bin_sz_t)(uintptr_t)
bin->stack_head;
/* Wraparound handled as well. */
uint16_t diff = cache_bin_diff(bin, bin->low_bits_full, low_bits_head,
/* racy */ false);
*(void **)((uintptr_t)bin->stack_head - diff) = ptr;
cache_bin_sz_t diff = cache_bin_diff(
bin, bin->low_bits_full, low_bits_head);
*(void **)((byte_t *)bin->stack_head - diff) = ptr;
assert(!cache_bin_full(bin));
bin->low_bits_full += sizeof(void *);
@@ -465,67 +533,101 @@ cache_bin_stash(cache_bin_t *bin, void *ptr) {
return true;
}
/*
* Get the number of stashed pointers.
*
* When called from a thread not owning the TLS (i.e. racy = true), it's
* important to keep in mind that 'bin->stack_head' and 'bin->low_bits_full' can
* be modified concurrently and almost none assertions about their values can be
* made.
*/
/* Get the number of stashed pointers. */
JEMALLOC_ALWAYS_INLINE cache_bin_sz_t
cache_bin_nstashed_get_internal(cache_bin_t *bin, cache_bin_info_t *info,
bool racy) {
cache_bin_sz_t ncached_max = cache_bin_info_ncached_max(info);
uint16_t low_bits_low_bound = cache_bin_low_bits_low_bound_get(bin,
info);
cache_bin_nstashed_get_internal(cache_bin_t *bin) {
cache_bin_sz_t ncached_max = cache_bin_ncached_max_get(bin);
cache_bin_sz_t low_bits_low_bound = cache_bin_low_bits_low_bound_get(
bin);
cache_bin_sz_t n = cache_bin_diff(bin, low_bits_low_bound,
bin->low_bits_full, racy) / sizeof(void *);
cache_bin_sz_t n = cache_bin_diff(
bin, low_bits_low_bound, bin->low_bits_full)
/ sizeof(void *);
assert(n <= ncached_max);
if (!racy) {
if (config_debug && n != 0) {
/* Below are for assertions only. */
void **low_bound = cache_bin_low_bound_get(bin, info);
void **low_bound = cache_bin_low_bound_get(bin);
assert((uint16_t)(uintptr_t)low_bound == low_bits_low_bound);
assert(
(cache_bin_sz_t)(uintptr_t)low_bound == low_bits_low_bound);
void *stashed = *(low_bound + n - 1);
bool aligned = cache_bin_nonfast_aligned(stashed);
bool aligned = cache_bin_nonfast_aligned(stashed);
#ifdef JEMALLOC_JET
/* Allow arbitrary pointers to be stashed in tests. */
aligned = true;
#endif
assert(n == 0 || (stashed != NULL && aligned));
assert(stashed != NULL && aligned);
}
return n;
}
JEMALLOC_ALWAYS_INLINE cache_bin_sz_t
cache_bin_nstashed_get_local(cache_bin_t *bin, cache_bin_info_t *info) {
cache_bin_sz_t n = cache_bin_nstashed_get_internal(bin, info,
/* racy */ false);
assert(n <= cache_bin_info_ncached_max(info));
cache_bin_nstashed_get_local(cache_bin_t *bin) {
cache_bin_sz_t n = cache_bin_nstashed_get_internal(bin);
assert(n <= cache_bin_ncached_max_get(bin));
return n;
}
/*
* Obtain a racy view of the number of items currently in the cache bin, in the
* presence of possible concurrent modifications.
*
* Note that this is the only racy function in this header. Any other functions
* are assumed to be non-racy. The "racy" term here means accessed from another
* thread (that is not the owner of the specific cache bin). This only happens
* when gathering stats (read-only). The only change because of the racy
* condition is that assertions based on mutable fields are omitted.
*
* It's important to keep in mind that 'bin->stack_head' and
* 'bin->low_bits_full' can be modified concurrently and almost no assertions
* about their values can be made.
*
* This function should not call other utility functions because the racy
* condition may cause unexpected / undefined behaviors in unverified utility
* functions. Currently, this function calls two utility functions
* cache_bin_ncached_max_get and cache_bin_low_bits_low_bound_get because
* they help access values that will not be concurrently modified.
*/
static inline void
cache_bin_nitems_get_remote(cache_bin_t *bin, cache_bin_info_t *info,
cache_bin_sz_t *ncached, cache_bin_sz_t *nstashed) {
cache_bin_sz_t n = cache_bin_ncached_get_internal(bin, /* racy */ true);
assert(n <= cache_bin_info_ncached_max(info));
cache_bin_nitems_get_remote(
cache_bin_t *bin, cache_bin_sz_t *ncached, cache_bin_sz_t *nstashed) {
/* Racy version of cache_bin_ncached_get_internal. */
cache_bin_sz_t diff = bin->low_bits_empty
- (cache_bin_sz_t)(uintptr_t)bin->stack_head;
cache_bin_sz_t n = diff / sizeof(void *);
*ncached = n;
n = cache_bin_nstashed_get_internal(bin, info, /* racy */ true);
assert(n <= cache_bin_info_ncached_max(info));
/* Racy version of cache_bin_nstashed_get_internal. */
cache_bin_sz_t low_bits_low_bound = cache_bin_low_bits_low_bound_get(
bin);
n = (bin->low_bits_full - low_bits_low_bound) / sizeof(void *);
*nstashed = n;
/* Note that we cannot assert ncached + nstashed <= ncached_max (racy). */
/*
* Note that we cannot assert anything regarding ncached_max because
* it can be configured on the fly and is thus racy.
*/
}
/*
* For small bins, used to calculate how many items to fill at a time.
* The final nfill is calculated by (ncached_max >> (base - offset)).
*/
typedef struct cache_bin_fill_ctl_s cache_bin_fill_ctl_t;
struct cache_bin_fill_ctl_s {
uint8_t base;
uint8_t offset;
};
/*
* Limit how many items can be flushed in a batch (which is the upper bound
* for the nflush parameter in tcache_bin_flush_impl()).
* This is to avoid stack overflow when we do batch edata look up, which
* reserves a nflush * sizeof(emap_batch_lookup_result_t) stack variable.
*/
#define CACHE_BIN_NFLUSH_BATCH_MAX \
((VARIABLE_ARRAY_SIZE_MAX >> LG_SIZEOF_PTR) - 1)
/*
* Filling and flushing are done in batch, on arrays of void *s. For filling,
* the arrays go forward, and can be accessed with ordinary array arithmetic.
@@ -546,7 +648,7 @@ cache_bin_nitems_get_remote(cache_bin_t *bin, cache_bin_info_t *info,
typedef struct cache_bin_ptr_array_s cache_bin_ptr_array_t;
struct cache_bin_ptr_array_s {
cache_bin_sz_t n;
void **ptr;
void **ptr;
};
/*
@@ -558,18 +660,18 @@ struct cache_bin_ptr_array_s {
* representations is easy (since they'll require an alloca in the calling
* frame).
*/
#define CACHE_BIN_PTR_ARRAY_DECLARE(name, nval) \
cache_bin_ptr_array_t name; \
name.n = (nval)
#define CACHE_BIN_PTR_ARRAY_DECLARE(name, nval) \
cache_bin_ptr_array_t name; \
name.n = (nval)
/*
* Start a fill. The bin must be empty, and this must be followed by a
* finish_fill call before doing any alloc/dalloc operations on the bin.
*/
static inline void
cache_bin_init_ptr_array_for_fill(cache_bin_t *bin, cache_bin_info_t *info,
cache_bin_ptr_array_t *arr, cache_bin_sz_t nfill) {
cache_bin_assert_empty(bin, info);
cache_bin_init_ptr_array_for_fill(
cache_bin_t *bin, cache_bin_ptr_array_t *arr, cache_bin_sz_t nfill) {
cache_bin_assert_empty(bin);
arr->ptr = cache_bin_empty_position_get(bin) - nfill;
}
@ -579,15 +681,19 @@ cache_bin_init_ptr_array_for_fill(cache_bin_t *bin, cache_bin_info_t *info,
* case of OOM.
*/
static inline void
cache_bin_finish_fill(cache_bin_t *bin, cache_bin_info_t *info,
cache_bin_ptr_array_t *arr, cache_bin_sz_t nfilled) {
cache_bin_assert_empty(bin, info);
cache_bin_finish_fill(
cache_bin_t *bin, cache_bin_ptr_array_t *arr, cache_bin_sz_t nfilled) {
cache_bin_assert_empty(bin);
void **empty_position = cache_bin_empty_position_get(bin);
if (nfilled < arr->n) {
memmove(empty_position - nfilled, empty_position - arr->n,
nfilled * sizeof(void *));
}
bin->stack_head = empty_position - nfilled;
/* Reset the bin stats as they were merged during the fill. */
if (config_stats) {
bin->tstats.nrequests = 0;
}
}
/*
@ -595,55 +701,61 @@ cache_bin_finish_fill(cache_bin_t *bin, cache_bin_info_t *info,
* everything we give them.
*/
static inline void
cache_bin_init_ptr_array_for_flush(cache_bin_t *bin, cache_bin_info_t *info,
cache_bin_ptr_array_t *arr, cache_bin_sz_t nflush) {
cache_bin_init_ptr_array_for_flush(
cache_bin_t *bin, cache_bin_ptr_array_t *arr, cache_bin_sz_t nflush) {
arr->ptr = cache_bin_empty_position_get(bin) - nflush;
assert(cache_bin_ncached_get_local(bin, info) == 0
|| *arr->ptr != NULL);
assert(cache_bin_ncached_get_local(bin) == 0 || *arr->ptr != NULL);
}
static inline void
cache_bin_finish_flush(cache_bin_t *bin, cache_bin_info_t *info,
cache_bin_ptr_array_t *arr, cache_bin_sz_t nflushed) {
unsigned rem = cache_bin_ncached_get_local(bin, info) - nflushed;
memmove(bin->stack_head + nflushed, bin->stack_head,
rem * sizeof(void *));
bin->stack_head = bin->stack_head + nflushed;
cache_bin_finish_flush(
cache_bin_t *bin, cache_bin_ptr_array_t *arr, cache_bin_sz_t nflushed) {
unsigned rem = cache_bin_ncached_get_local(bin) - nflushed;
memmove(
bin->stack_head + nflushed, bin->stack_head, rem * sizeof(void *));
bin->stack_head += nflushed;
cache_bin_low_water_adjust(bin);
/* Reset the bin stats as they were merged during the flush. */
if (config_stats) {
bin->tstats.nrequests = 0;
}
}
static inline void
cache_bin_init_ptr_array_for_stashed(cache_bin_t *bin, szind_t binind,
cache_bin_info_t *info, cache_bin_ptr_array_t *arr,
cache_bin_sz_t nstashed) {
cache_bin_ptr_array_t *arr, cache_bin_sz_t nstashed) {
assert(nstashed > 0);
assert(cache_bin_nstashed_get_local(bin, info) == nstashed);
assert(cache_bin_nstashed_get_local(bin) == nstashed);
void **low_bound = cache_bin_low_bound_get(bin, info);
void **low_bound = cache_bin_low_bound_get(bin);
arr->ptr = low_bound;
assert(*arr->ptr != NULL);
}
static inline void
cache_bin_finish_flush_stashed(cache_bin_t *bin, cache_bin_info_t *info) {
void **low_bound = cache_bin_low_bound_get(bin, info);
cache_bin_finish_flush_stashed(cache_bin_t *bin) {
void **low_bound = cache_bin_low_bound_get(bin);
/* Reset the bin local full position. */
bin->low_bits_full = (uint16_t)(uintptr_t)low_bound;
assert(cache_bin_nstashed_get_local(bin, info) == 0);
assert(cache_bin_nstashed_get_local(bin) == 0);
/* Reset the bin stats as they were merged during the flush. */
if (config_stats) {
bin->tstats.nrequests = 0;
}
}
/*
* Initialize a cache_bin_info to represent up to the given number of items in
* the cache_bins it is associated with.
*/
void cache_bin_info_init(cache_bin_info_t *bin_info,
cache_bin_sz_t ncached_max);
void cache_bin_info_init(
cache_bin_info_t *bin_info, cache_bin_sz_t ncached_max);
/*
* Given an array of initialized cache_bin_info_ts, determine how big an
* allocation is required to initialize a full set of cache_bin_ts.
*/
void cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
void cache_bin_info_compute_alloc(const cache_bin_info_t *infos, szind_t ninfos,
size_t *size, size_t *alignment);
/*
@ -653,18 +765,13 @@ void cache_bin_info_compute_alloc(cache_bin_info_t *infos, szind_t ninfos,
* cache_bin_postincrement. *alloc_cur will then point immediately past the end
* of the allocation.
*/
void cache_bin_preincrement(cache_bin_info_t *infos, szind_t ninfos,
void cache_bin_preincrement(const cache_bin_info_t *infos, szind_t ninfos,
void *alloc, size_t *cur_offset);
void cache_bin_postincrement(cache_bin_info_t *infos, szind_t ninfos,
void *alloc, size_t *cur_offset);
void cache_bin_init(cache_bin_t *bin, cache_bin_info_t *info, void *alloc,
void cache_bin_postincrement(void *alloc, size_t *cur_offset);
void cache_bin_init(cache_bin_t *bin, const cache_bin_info_t *info, void *alloc,
size_t *cur_offset);
void cache_bin_init_disabled(cache_bin_t *bin, cache_bin_sz_t ncached_max);
/*
* If a cache bin was zero initialized (either because it lives in static or
* thread-local storage, or was memset to 0), this function indicates whether or
* not cache_bin_init was called on it.
*/
bool cache_bin_still_zero_initialized(cache_bin_t *bin);
bool cache_bin_stack_use_thp(void);
#endif /* JEMALLOC_INTERNAL_CACHE_BIN_H */


@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_CKH_H
#define JEMALLOC_INTERNAL_CKH_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/tsd.h"
/* Cuckoo hashing implementation. Skip to the end for the interface. */
@ -21,8 +22,8 @@
#define LG_CKH_BUCKET_CELLS (LG_CACHELINE - LG_SIZEOF_PTR - 1)
/* Typedefs to allow easy function pointer passing. */
typedef void ckh_hash_t (const void *, size_t[2]);
typedef bool ckh_keycomp_t (const void *, const void *);
typedef void ckh_hash_t(const void *, size_t[2]);
typedef bool ckh_keycomp_t(const void *, const void *);
/* Hash table cell. */
typedef struct {
@ -55,7 +56,7 @@ typedef struct {
unsigned lg_curbuckets;
/* Hash and comparison functions. */
ckh_hash_t *hash;
ckh_hash_t *hash;
ckh_keycomp_t *keycomp;
/* Hash table with 2^lg_curbuckets buckets. */
@ -88,8 +89,8 @@ bool ckh_iter(ckh_t *ckh, size_t *tabind, void **key, void **data);
* the key and value, and doesn't do any lifetime management.
*/
bool ckh_insert(tsd_t *tsd, ckh_t *ckh, const void *key, const void *data);
bool ckh_remove(tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key,
void **data);
bool ckh_remove(
tsd_t *tsd, ckh_t *ckh, const void *searchkey, void **key, void **data);
bool ckh_search(ckh_t *ckh, const void *searchkey, void **key, void **data);
/* Some useful hash and comparison functions for strings and pointers. */


@ -0,0 +1,23 @@
#ifndef JEMALLOC_INTERNAL_CONF_H
#define JEMALLOC_INTERNAL_CONF_H
#include "jemalloc/internal/sc.h"
void malloc_conf_init(sc_data_t *sc_data, unsigned bin_shard_sizes[SC_NBINS],
char readlink_buf[PATH_MAX + 1]);
void malloc_abort_invalid_conf(void);
#ifdef JEMALLOC_JET
extern bool had_conf_error;
bool conf_next(char const **opts_p, char const **k_p, size_t *klen_p,
char const **v_p, size_t *vlen_p);
void conf_error(
const char *msg, const char *k, size_t klen, const char *v, size_t vlen);
bool conf_handle_bool(const char *v, size_t vlen, bool *result);
bool conf_handle_signed(const char *v, size_t vlen, intmax_t min, intmax_t max,
bool check_min, bool check_max, bool clip, intmax_t *result);
bool conf_handle_char_p(const char *v, size_t vlen, char *dest, size_t dest_sz);
#endif
#endif /* JEMALLOC_INTERNAL_CONF_H */


@ -1,12 +1,14 @@
#ifndef JEMALLOC_INTERNAL_COUNTER_H
#define JEMALLOC_INTERNAL_COUNTER_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/lockedint.h"
#include "jemalloc/internal/mutex.h"
typedef struct counter_accum_s {
LOCKEDINT_MTX_DECLARE(mtx)
locked_u64_t accumbytes;
uint64_t interval;
uint64_t interval;
} counter_accum_t;
JEMALLOC_ALWAYS_INLINE bool


@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_CTL_H
#define JEMALLOC_INTERNAL_CTL_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_stats.h"
#include "jemalloc/internal/background_thread_structs.h"
#include "jemalloc/internal/bin_stats.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/malloc_io.h"
#include "jemalloc/internal/mutex_prof.h"
@ -9,50 +13,52 @@
#include "jemalloc/internal/stats.h"
/* Maximum ctl tree depth. */
#define CTL_MAX_DEPTH 7
#define CTL_MAX_DEPTH 7
#define CTL_MULTI_SETTING_MAX_LEN 1000
typedef struct ctl_node_s {
bool named;
} ctl_node_t;
typedef struct ctl_named_node_s {
ctl_node_t node;
ctl_node_t node;
const char *name;
/* If (nchildren == 0), this is a terminal node. */
size_t nchildren;
size_t nchildren;
const ctl_node_t *children;
int (*ctl)(tsd_t *, const size_t *, size_t, void *, size_t *, void *,
size_t);
int (*ctl)(
tsd_t *, const size_t *, size_t, void *, size_t *, void *, size_t);
} ctl_named_node_t;
typedef struct ctl_indexed_node_s {
struct ctl_node_s node;
const ctl_named_node_t *(*index)(tsdn_t *, const size_t *, size_t,
size_t);
const ctl_named_node_t *(*index)(
tsdn_t *, const size_t *, size_t, size_t);
} ctl_indexed_node_t;
typedef struct ctl_arena_stats_s {
arena_stats_t astats;
/* Aggregate stats for small size classes, based on bin stats. */
size_t allocated_small;
size_t allocated_small;
uint64_t nmalloc_small;
uint64_t ndalloc_small;
uint64_t nrequests_small;
uint64_t nfills_small;
uint64_t nflushes_small;
bin_stats_data_t bstats[SC_NBINS];
bin_stats_data_t bstats[SC_NBINS];
arena_stats_large_t lstats[SC_NSIZES - SC_NBINS];
pac_estats_t estats[SC_NPSIZES];
hpa_shard_stats_t hpastats;
sec_stats_t secstats;
pac_estats_t estats[SC_NPSIZES];
hpa_shard_stats_t hpastats;
} ctl_arena_stats_t;
typedef struct ctl_stats_s {
size_t allocated;
size_t active;
size_t metadata;
size_t metadata_edata;
size_t metadata_rtree;
size_t metadata_thp;
size_t resident;
size_t mapped;
@ -65,17 +71,17 @@ typedef struct ctl_stats_s {
typedef struct ctl_arena_s ctl_arena_t;
struct ctl_arena_s {
unsigned arena_ind;
bool initialized;
bool initialized;
ql_elm(ctl_arena_t) destroyed_link;
/* Basic stats, supported even if !config_stats. */
unsigned nthreads;
unsigned nthreads;
const char *dss;
ssize_t dirty_decay_ms;
ssize_t muzzy_decay_ms;
size_t pactive;
size_t pdirty;
size_t pmuzzy;
ssize_t dirty_decay_ms;
ssize_t muzzy_decay_ms;
size_t pactive;
size_t pdirty;
size_t pmuzzy;
/* NULL if !config_stats. */
ctl_arena_stats_t *astats;
@ -100,60 +106,67 @@ int ctl_byname(tsd_t *tsd, const char *name, void *oldp, size_t *oldlenp,
int ctl_nametomib(tsd_t *tsd, const char *name, size_t *mibp, size_t *miblenp);
int ctl_bymib(tsd_t *tsd, const size_t *mib, size_t miblen, void *oldp,
size_t *oldlenp, void *newp, size_t newlen);
int ctl_mibnametomib(tsd_t *tsd, size_t *mib, size_t miblen, const char *name,
size_t *miblenp);
int ctl_bymibname(tsd_t *tsd, size_t *mib, size_t miblen, const char *name,
size_t *miblenp, void *oldp, size_t *oldlenp, void *newp, size_t newlen);
int ctl_mibnametomib(
tsd_t *tsd, size_t *mib, size_t miblen, const char *name, size_t *miblenp);
int ctl_bymibname(tsd_t *tsd, size_t *mib, size_t miblen, const char *name,
size_t *miblenp, void *oldp, size_t *oldlenp, void *newp, size_t newlen);
bool ctl_boot(void);
void ctl_prefork(tsdn_t *tsdn);
void ctl_postfork_parent(tsdn_t *tsdn);
void ctl_postfork_child(tsdn_t *tsdn);
void ctl_mtx_assert_held(tsdn_t *tsdn);
#define xmallctl(name, oldp, oldlenp, newp, newlen) do { \
if (je_mallctl(name, oldp, oldlenp, newp, newlen) \
!= 0) { \
malloc_printf( \
"<jemalloc>: Failure in xmallctl(\"%s\", ...)\n", \
name); \
abort(); \
} \
} while (0)
#define xmallctl(name, oldp, oldlenp, newp, newlen) \
do { \
if (je_mallctl(name, oldp, oldlenp, newp, newlen) != 0) { \
malloc_printf( \
"<jemalloc>: Failure in xmallctl(\"%s\", ...)\n", \
name); \
abort(); \
} \
} while (0)
#define xmallctlnametomib(name, mibp, miblenp) do { \
if (je_mallctlnametomib(name, mibp, miblenp) != 0) { \
malloc_printf("<jemalloc>: Failure in " \
"xmallctlnametomib(\"%s\", ...)\n", name); \
abort(); \
} \
} while (0)
#define xmallctlnametomib(name, mibp, miblenp) \
do { \
if (je_mallctlnametomib(name, mibp, miblenp) != 0) { \
malloc_printf( \
"<jemalloc>: Failure in " \
"xmallctlnametomib(\"%s\", ...)\n", \
name); \
abort(); \
} \
} while (0)
#define xmallctlbymib(mib, miblen, oldp, oldlenp, newp, newlen) do { \
if (je_mallctlbymib(mib, miblen, oldp, oldlenp, newp, \
newlen) != 0) { \
malloc_write( \
"<jemalloc>: Failure in xmallctlbymib()\n"); \
abort(); \
} \
} while (0)
#define xmallctlbymib(mib, miblen, oldp, oldlenp, newp, newlen) \
do { \
if (je_mallctlbymib(mib, miblen, oldp, oldlenp, newp, newlen) \
!= 0) { \
malloc_write( \
"<jemalloc>: Failure in xmallctlbymib()\n"); \
abort(); \
} \
} while (0)
#define xmallctlmibnametomib(mib, miblen, name, miblenp) do { \
if (ctl_mibnametomib(tsd_fetch(), mib, miblen, name, miblenp) \
!= 0) { \
malloc_write( \
"<jemalloc>: Failure in ctl_mibnametomib()\n"); \
abort(); \
} \
} while (0)
#define xmallctlmibnametomib(mib, miblen, name, miblenp) \
do { \
if (ctl_mibnametomib(tsd_fetch(), mib, miblen, name, miblenp) \
!= 0) { \
malloc_write( \
"<jemalloc>: Failure in ctl_mibnametomib()\n"); \
abort(); \
} \
} while (0)
#define xmallctlbymibname(mib, miblen, name, miblenp, oldp, oldlenp, \
newp, newlen) do { \
if (ctl_bymibname(tsd_fetch(), mib, miblen, name, miblenp, \
oldp, oldlenp, newp, newlen) != 0) { \
malloc_write( \
"<jemalloc>: Failure in ctl_bymibname()\n"); \
abort(); \
} \
} while (0)
#define xmallctlbymibname( \
mib, miblen, name, miblenp, oldp, oldlenp, newp, newlen) \
do { \
if (ctl_bymibname(tsd_fetch(), mib, miblen, name, miblenp, \
oldp, oldlenp, newp, newlen) \
!= 0) { \
malloc_write( \
"<jemalloc>: Failure in ctl_bymibname()\n"); \
abort(); \
} \
} while (0)
#endif /* JEMALLOC_INTERNAL_CTL_H */


@ -1,9 +1,11 @@
#ifndef JEMALLOC_INTERNAL_DECAY_H
#define JEMALLOC_INTERNAL_DECAY_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/smoothstep.h"
#define DECAY_UNBOUNDED_TIME_TO_PURGE ((uint64_t)-1)
#define DECAY_UNBOUNDED_TIME_TO_PURGE ((uint64_t) - 1)
/*
* The decay_t computes the number of pages we should purge at any given time.
@ -166,12 +168,12 @@ void decay_reinit(decay_t *decay, nstime_t *cur_time, ssize_t decay_ms);
/*
* Compute how many of 'npages_new' pages we would need to purge in 'time'.
*/
uint64_t decay_npages_purge_in(decay_t *decay, nstime_t *time,
size_t npages_new);
uint64_t decay_npages_purge_in(
decay_t *decay, nstime_t *time, size_t npages_new);
/* Returns true if the epoch advanced and there are pages to purge. */
bool decay_maybe_advance_epoch(decay_t *decay, nstime_t *new_time,
size_t current_npages);
bool decay_maybe_advance_epoch(
decay_t *decay, nstime_t *new_time, size_t current_npages);
/*
* Calculates wait time until a number of pages in the interval
@ -180,7 +182,7 @@ bool decay_maybe_advance_epoch(decay_t *decay, nstime_t *new_time,
* Returns number of nanoseconds or DECAY_UNBOUNDED_TIME_TO_PURGE in case of
* indefinite wait.
*/
uint64_t decay_ns_until_purge(decay_t *decay, size_t npages_current,
uint64_t npages_threshold);
uint64_t decay_ns_until_purge(
decay_t *decay, size_t npages_current, uint64_t npages_threshold);
#endif /* JEMALLOC_INTERNAL_DECAY_H */


@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_DIV_H
#define JEMALLOC_INTERNAL_DIV_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
/*


@ -1,15 +1,16 @@
#ifndef JEMALLOC_INTERNAL_ECACHE_H
#define JEMALLOC_INTERNAL_ECACHE_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/eset.h"
#include "jemalloc/internal/san.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/san.h"
typedef struct ecache_s ecache_t;
struct ecache_s {
malloc_mutex_t mtx;
eset_t eset;
eset_t guarded_eset;
eset_t eset;
eset_t guarded_eset;
/* All stored extents must be in the same state. */
extent_state_t state;
/* The index of the ehooks the ecache is associated with. */
@ -23,22 +24,22 @@ struct ecache_s {
static inline size_t
ecache_npages_get(ecache_t *ecache) {
return eset_npages_get(&ecache->eset) +
eset_npages_get(&ecache->guarded_eset);
return eset_npages_get(&ecache->eset)
+ eset_npages_get(&ecache->guarded_eset);
}
/* Get the number of extents in the given page size index. */
static inline size_t
ecache_nextents_get(ecache_t *ecache, pszind_t ind) {
return eset_nextents_get(&ecache->eset, ind) +
eset_nextents_get(&ecache->guarded_eset, ind);
return eset_nextents_get(&ecache->eset, ind)
+ eset_nextents_get(&ecache->guarded_eset, ind);
}
/* Get the sum total bytes of the extents in the given page size index. */
static inline size_t
ecache_nbytes_get(ecache_t *ecache, pszind_t ind) {
return eset_nbytes_get(&ecache->eset, ind) +
eset_nbytes_get(&ecache->guarded_eset, ind);
return eset_nbytes_get(&ecache->eset, ind)
+ eset_nbytes_get(&ecache->guarded_eset, ind);
}
static inline unsigned


@ -1,12 +1,14 @@
#ifndef JEMALLOC_INTERNAL_EDATA_H
#define JEMALLOC_INTERNAL_EDATA_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/bin_info.h"
#include "jemalloc/internal/bit_util.h"
#include "jemalloc/internal/hpdata.h"
#include "jemalloc/internal/nstime.h"
#include "jemalloc/internal/ph.h"
#include "jemalloc/internal/prof_types.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/slab_data.h"
@ -19,10 +21,18 @@
*/
#define EDATA_ALIGNMENT 128
/*
* Defines how many nodes are visited when enumerating the heap to search for
* qualified extents. More nodes visited may result in better choices at
* the cost of longer search time. This size should not exceed 2^16 - 1
* because we use uint16_t for accessing the queue needed for enumeration.
*/
#define ESET_ENUMERATE_MAX_NUM 32
enum extent_state_e {
extent_state_active = 0,
extent_state_dirty = 1,
extent_state_muzzy = 2,
extent_state_active = 0,
extent_state_dirty = 1,
extent_state_muzzy = 2,
extent_state_retained = 3,
extent_state_transition = 4, /* States below are intermediate. */
extent_state_merging = 5,
@ -32,7 +42,7 @@ typedef enum extent_state_e extent_state_t;
enum extent_head_state_e {
EXTENT_NOT_HEAD,
EXTENT_IS_HEAD /* See comments in ehooks_default_merge_impl(). */
EXTENT_IS_HEAD /* See comments in ehooks_default_merge_impl(). */
};
typedef enum extent_head_state_e extent_head_state_t;
@ -40,25 +50,22 @@ typedef enum extent_head_state_e extent_head_state_t;
* Which implementation of the page allocator interface, (PAI, defined in
* pai.h) owns the given extent?
*/
enum extent_pai_e {
EXTENT_PAI_PAC = 0,
EXTENT_PAI_HPA = 1
};
enum extent_pai_e { EXTENT_PAI_PAC = 0, EXTENT_PAI_HPA = 1 };
typedef enum extent_pai_e extent_pai_t;
struct e_prof_info_s {
/* Time when this was allocated. */
nstime_t e_prof_alloc_time;
nstime_t e_prof_alloc_time;
/* Allocation request size. */
size_t e_prof_alloc_size;
size_t e_prof_alloc_size;
/* Points to a prof_tctx_t. */
atomic_p_t e_prof_tctx;
atomic_p_t e_prof_tctx;
/*
* Points to a prof_recent_t for the allocation; NULL
* means the recent allocation record no longer exists.
* Protected by prof_recent_alloc_mtx.
*/
atomic_p_t e_prof_recent_alloc;
atomic_p_t e_prof_recent_alloc;
};
typedef struct e_prof_info_s e_prof_info_t;
@ -75,20 +82,20 @@ typedef struct e_prof_info_s e_prof_info_t;
*/
typedef struct edata_map_info_s edata_map_info_t;
struct edata_map_info_s {
bool slab;
bool slab;
szind_t szind;
};
typedef struct edata_cmp_summary_s edata_cmp_summary_t;
struct edata_cmp_summary_s {
uint64_t sn;
uint64_t sn;
uintptr_t addr;
};
/* Extent (span of pages). Use accessor functions for e_* fields. */
typedef struct edata_s edata_t;
ph_structs(edata_avail, edata_t);
ph_structs(edata_heap, edata_t);
ph_structs(edata_avail, edata_t, ESET_ENUMERATE_MAX_NUM);
ph_structs(edata_heap, edata_t, ESET_ENUMERATE_MAX_NUM);
struct edata_s {
/*
* Bitfield containing several fields:
@ -139,55 +146,72 @@ struct edata_s {
*
* bin_shard: the shard of the bin from which this extent came.
*/
uint64_t e_bits;
#define MASK(CURRENT_FIELD_WIDTH, CURRENT_FIELD_SHIFT) ((((((uint64_t)0x1U) << (CURRENT_FIELD_WIDTH)) - 1)) << (CURRENT_FIELD_SHIFT))
uint64_t e_bits;
#define MASK(CURRENT_FIELD_WIDTH, CURRENT_FIELD_SHIFT) \
((((((uint64_t)0x1U) << (CURRENT_FIELD_WIDTH)) - 1)) \
<< (CURRENT_FIELD_SHIFT))
#define EDATA_BITS_ARENA_WIDTH MALLOCX_ARENA_BITS
#define EDATA_BITS_ARENA_SHIFT 0
#define EDATA_BITS_ARENA_MASK MASK(EDATA_BITS_ARENA_WIDTH, EDATA_BITS_ARENA_SHIFT)
#define EDATA_BITS_ARENA_WIDTH MALLOCX_ARENA_BITS
#define EDATA_BITS_ARENA_SHIFT 0
#define EDATA_BITS_ARENA_MASK \
MASK(EDATA_BITS_ARENA_WIDTH, EDATA_BITS_ARENA_SHIFT)
#define EDATA_BITS_SLAB_WIDTH 1
#define EDATA_BITS_SLAB_SHIFT (EDATA_BITS_ARENA_WIDTH + EDATA_BITS_ARENA_SHIFT)
#define EDATA_BITS_SLAB_MASK MASK(EDATA_BITS_SLAB_WIDTH, EDATA_BITS_SLAB_SHIFT)
#define EDATA_BITS_SLAB_WIDTH 1
#define EDATA_BITS_SLAB_SHIFT (EDATA_BITS_ARENA_WIDTH + EDATA_BITS_ARENA_SHIFT)
#define EDATA_BITS_SLAB_MASK MASK(EDATA_BITS_SLAB_WIDTH, EDATA_BITS_SLAB_SHIFT)
#define EDATA_BITS_COMMITTED_WIDTH 1
#define EDATA_BITS_COMMITTED_SHIFT (EDATA_BITS_SLAB_WIDTH + EDATA_BITS_SLAB_SHIFT)
#define EDATA_BITS_COMMITTED_MASK MASK(EDATA_BITS_COMMITTED_WIDTH, EDATA_BITS_COMMITTED_SHIFT)
#define EDATA_BITS_COMMITTED_WIDTH 1
#define EDATA_BITS_COMMITTED_SHIFT \
(EDATA_BITS_SLAB_WIDTH + EDATA_BITS_SLAB_SHIFT)
#define EDATA_BITS_COMMITTED_MASK \
MASK(EDATA_BITS_COMMITTED_WIDTH, EDATA_BITS_COMMITTED_SHIFT)
#define EDATA_BITS_PAI_WIDTH 1
#define EDATA_BITS_PAI_SHIFT (EDATA_BITS_COMMITTED_WIDTH + EDATA_BITS_COMMITTED_SHIFT)
#define EDATA_BITS_PAI_MASK MASK(EDATA_BITS_PAI_WIDTH, EDATA_BITS_PAI_SHIFT)
#define EDATA_BITS_PAI_WIDTH 1
#define EDATA_BITS_PAI_SHIFT \
(EDATA_BITS_COMMITTED_WIDTH + EDATA_BITS_COMMITTED_SHIFT)
#define EDATA_BITS_PAI_MASK MASK(EDATA_BITS_PAI_WIDTH, EDATA_BITS_PAI_SHIFT)
#define EDATA_BITS_ZEROED_WIDTH 1
#define EDATA_BITS_ZEROED_SHIFT (EDATA_BITS_PAI_WIDTH + EDATA_BITS_PAI_SHIFT)
#define EDATA_BITS_ZEROED_MASK MASK(EDATA_BITS_ZEROED_WIDTH, EDATA_BITS_ZEROED_SHIFT)
#define EDATA_BITS_ZEROED_WIDTH 1
#define EDATA_BITS_ZEROED_SHIFT (EDATA_BITS_PAI_WIDTH + EDATA_BITS_PAI_SHIFT)
#define EDATA_BITS_ZEROED_MASK \
MASK(EDATA_BITS_ZEROED_WIDTH, EDATA_BITS_ZEROED_SHIFT)
#define EDATA_BITS_GUARDED_WIDTH 1
#define EDATA_BITS_GUARDED_SHIFT (EDATA_BITS_ZEROED_WIDTH + EDATA_BITS_ZEROED_SHIFT)
#define EDATA_BITS_GUARDED_MASK MASK(EDATA_BITS_GUARDED_WIDTH, EDATA_BITS_GUARDED_SHIFT)
#define EDATA_BITS_GUARDED_WIDTH 1
#define EDATA_BITS_GUARDED_SHIFT \
(EDATA_BITS_ZEROED_WIDTH + EDATA_BITS_ZEROED_SHIFT)
#define EDATA_BITS_GUARDED_MASK \
MASK(EDATA_BITS_GUARDED_WIDTH, EDATA_BITS_GUARDED_SHIFT)
#define EDATA_BITS_STATE_WIDTH 3
#define EDATA_BITS_STATE_SHIFT (EDATA_BITS_GUARDED_WIDTH + EDATA_BITS_GUARDED_SHIFT)
#define EDATA_BITS_STATE_MASK MASK(EDATA_BITS_STATE_WIDTH, EDATA_BITS_STATE_SHIFT)
#define EDATA_BITS_STATE_WIDTH 3
#define EDATA_BITS_STATE_SHIFT \
(EDATA_BITS_GUARDED_WIDTH + EDATA_BITS_GUARDED_SHIFT)
#define EDATA_BITS_STATE_MASK \
MASK(EDATA_BITS_STATE_WIDTH, EDATA_BITS_STATE_SHIFT)
#define EDATA_BITS_SZIND_WIDTH LG_CEIL(SC_NSIZES)
#define EDATA_BITS_SZIND_SHIFT (EDATA_BITS_STATE_WIDTH + EDATA_BITS_STATE_SHIFT)
#define EDATA_BITS_SZIND_MASK MASK(EDATA_BITS_SZIND_WIDTH, EDATA_BITS_SZIND_SHIFT)
#define EDATA_BITS_SZIND_WIDTH LG_CEIL(SC_NSIZES)
#define EDATA_BITS_SZIND_SHIFT (EDATA_BITS_STATE_WIDTH + EDATA_BITS_STATE_SHIFT)
#define EDATA_BITS_SZIND_MASK \
MASK(EDATA_BITS_SZIND_WIDTH, EDATA_BITS_SZIND_SHIFT)
#define EDATA_BITS_NFREE_WIDTH (SC_LG_SLAB_MAXREGS + 1)
#define EDATA_BITS_NFREE_SHIFT (EDATA_BITS_SZIND_WIDTH + EDATA_BITS_SZIND_SHIFT)
#define EDATA_BITS_NFREE_MASK MASK(EDATA_BITS_NFREE_WIDTH, EDATA_BITS_NFREE_SHIFT)
#define EDATA_BITS_NFREE_WIDTH (SC_LG_SLAB_MAXREGS + 1)
#define EDATA_BITS_NFREE_SHIFT (EDATA_BITS_SZIND_WIDTH + EDATA_BITS_SZIND_SHIFT)
#define EDATA_BITS_NFREE_MASK \
MASK(EDATA_BITS_NFREE_WIDTH, EDATA_BITS_NFREE_SHIFT)
#define EDATA_BITS_BINSHARD_WIDTH 6
#define EDATA_BITS_BINSHARD_SHIFT (EDATA_BITS_NFREE_WIDTH + EDATA_BITS_NFREE_SHIFT)
#define EDATA_BITS_BINSHARD_MASK MASK(EDATA_BITS_BINSHARD_WIDTH, EDATA_BITS_BINSHARD_SHIFT)
#define EDATA_BITS_BINSHARD_WIDTH 6
#define EDATA_BITS_BINSHARD_SHIFT \
(EDATA_BITS_NFREE_WIDTH + EDATA_BITS_NFREE_SHIFT)
#define EDATA_BITS_BINSHARD_MASK \
MASK(EDATA_BITS_BINSHARD_WIDTH, EDATA_BITS_BINSHARD_SHIFT)
#define EDATA_BITS_IS_HEAD_WIDTH 1
#define EDATA_BITS_IS_HEAD_SHIFT (EDATA_BITS_BINSHARD_WIDTH + EDATA_BITS_BINSHARD_SHIFT)
#define EDATA_BITS_IS_HEAD_MASK MASK(EDATA_BITS_IS_HEAD_WIDTH, EDATA_BITS_IS_HEAD_SHIFT)
#define EDATA_BITS_IS_HEAD_SHIFT \
(EDATA_BITS_BINSHARD_WIDTH + EDATA_BITS_BINSHARD_SHIFT)
#define EDATA_BITS_IS_HEAD_MASK \
MASK(EDATA_BITS_IS_HEAD_WIDTH, EDATA_BITS_IS_HEAD_SHIFT)
/* Pointer to the extent that this structure is responsible for. */
void *e_addr;
void *e_addr;
union {
/*
@ -197,16 +221,16 @@ struct edata_s {
*
* ssssssss [...] ssssssss ssssnnnn nnnnnnnn
*/
size_t e_size_esn;
#define EDATA_SIZE_MASK ((size_t)~(PAGE-1))
#define EDATA_ESN_MASK ((size_t)PAGE-1)
size_t e_size_esn;
#define EDATA_SIZE_MASK ((size_t) ~(PAGE - 1))
#define EDATA_ESN_MASK ((size_t)PAGE - 1)
/* Base extent size, which may not be a multiple of PAGE. */
size_t e_bsize;
size_t e_bsize;
};
/*
* If this edata is a user allocation from an HPA, it comes out of some
* pageslab (we don't yet support huegpage allocations that don't fit
* pageslab (we don't yet support hugepage allocations that don't fit
* into pageslabs). This tracks it.
*/
hpdata_t *e_ps;
@ -222,7 +246,7 @@ struct edata_s {
* List linkage used when the edata_t is active; either in
* arena's large allocations or bin_t's slabs_full.
*/
ql_elm(edata_t) ql_link_active;
ql_elm(edata_t) ql_link_active;
/*
* Pairing heap linkage. Used whenever the extent is inactive
* (in the page allocators), or when it is active and in
@ -230,7 +254,7 @@ struct edata_s {
* extent and sitting in an edata_cache.
*/
union {
edata_heap_link_t heap_link;
edata_heap_link_t heap_link;
edata_avail_link_t avail_link;
};
};
@ -243,10 +267,10 @@ struct edata_s {
*/
ql_elm(edata_t) ql_link_inactive;
/* Small region slab metadata. */
slab_data_t e_slab_data;
slab_data_t e_slab_data;
/* Profiling data, used for large objects. */
e_prof_info_t e_prof_info;
e_prof_info_t e_prof_info;
};
};
@ -255,8 +279,8 @@ TYPED_LIST(edata_list_inactive, edata_t, ql_link_inactive)
static inline unsigned
edata_arena_ind_get(const edata_t *edata) {
unsigned arena_ind = (unsigned)((edata->e_bits &
EDATA_BITS_ARENA_MASK) >> EDATA_BITS_ARENA_SHIFT);
unsigned arena_ind = (unsigned)((edata->e_bits & EDATA_BITS_ARENA_MASK)
>> EDATA_BITS_ARENA_SHIFT);
assert(arena_ind < MALLOCX_ARENA_LIMIT);
return arena_ind;
@ -264,8 +288,8 @@ edata_arena_ind_get(const edata_t *edata) {
static inline szind_t
edata_szind_get_maybe_invalid(const edata_t *edata) {
szind_t szind = (szind_t)((edata->e_bits & EDATA_BITS_SZIND_MASK) >>
EDATA_BITS_SZIND_SHIFT);
szind_t szind = (szind_t)((edata->e_bits & EDATA_BITS_SZIND_MASK)
>> EDATA_BITS_SZIND_SHIFT);
assert(szind <= SC_NSIZES);
return szind;
}
@ -279,13 +303,61 @@ edata_szind_get(const edata_t *edata) {
static inline size_t
edata_usize_get(const edata_t *edata) {
return sz_index2size(edata_szind_get(edata));
assert(edata != NULL);
/*
* When sz_large_size_classes_disabled() is true, there are two cases:
* 1. if usize_from_ind is not smaller than SC_LARGE_MINCLASS,
* usize_from_size is accurate;
* 2. otherwise, usize_from_ind is accurate.
*
* When sz_large_size_classes_disabled() is false, the two are the same
* whenever usize_from_ind is not smaller than SC_LARGE_MINCLASS.
*
* Note that sampled small allocs are promoted. Their extent size is
* recorded in edata_size_get(edata), while their szind reflects the
* true usize. Thus, the usize retrieved here is still accurate for
* sampled small allocs.
*/
szind_t szind = edata_szind_get(edata);
#ifdef JEMALLOC_JET
/*
* Double free is invalid and results in undefined behavior. However,
* for double free tests to end gracefully, return an invalid usize
* when szind shows the edata is not active, i.e., szind == SC_NSIZES.
*/
if (unlikely(szind == SC_NSIZES)) {
return SC_LARGE_MAXCLASS + 1;
}
#endif
if (!sz_large_size_classes_disabled() || szind < SC_NBINS) {
size_t usize_from_ind = sz_index2size(szind);
if (!sz_large_size_classes_disabled()
&& usize_from_ind >= SC_LARGE_MINCLASS) {
size_t size = (edata->e_size_esn & EDATA_SIZE_MASK);
assert(size > sz_large_pad);
size_t usize_from_size = size - sz_large_pad;
assert(usize_from_ind == usize_from_size);
}
return usize_from_ind;
}
size_t size = (edata->e_size_esn & EDATA_SIZE_MASK);
assert(size > sz_large_pad);
size_t usize_from_size = size - sz_large_pad;
/*
* Whether or not large size classes are disabled, a usize retrieved
* from the size is not accurate when it is smaller than
* SC_LARGE_MINCLASS.
*/
assert(usize_from_size >= SC_LARGE_MINCLASS);
return usize_from_size;
}
static inline unsigned
edata_binshard_get(const edata_t *edata) {
unsigned binshard = (unsigned)((edata->e_bits &
EDATA_BITS_BINSHARD_MASK) >> EDATA_BITS_BINSHARD_SHIFT);
unsigned binshard = (unsigned)((edata->e_bits
& EDATA_BITS_BINSHARD_MASK)
>> EDATA_BITS_BINSHARD_SHIFT);
assert(binshard < bin_infos[edata_szind_get(edata)].n_shards);
return binshard;
}
@ -297,58 +369,58 @@ edata_sn_get(const edata_t *edata) {
static inline extent_state_t
edata_state_get(const edata_t *edata) {
return (extent_state_t)((edata->e_bits & EDATA_BITS_STATE_MASK) >>
EDATA_BITS_STATE_SHIFT);
return (extent_state_t)((edata->e_bits & EDATA_BITS_STATE_MASK)
>> EDATA_BITS_STATE_SHIFT);
}
static inline bool
edata_guarded_get(const edata_t *edata) {
return (bool)((edata->e_bits & EDATA_BITS_GUARDED_MASK) >>
EDATA_BITS_GUARDED_SHIFT);
return (bool)((edata->e_bits & EDATA_BITS_GUARDED_MASK)
>> EDATA_BITS_GUARDED_SHIFT);
}
static inline bool
edata_zeroed_get(const edata_t *edata) {
return (bool)((edata->e_bits & EDATA_BITS_ZEROED_MASK) >>
EDATA_BITS_ZEROED_SHIFT);
return (bool)((edata->e_bits & EDATA_BITS_ZEROED_MASK)
>> EDATA_BITS_ZEROED_SHIFT);
}
static inline bool
edata_committed_get(const edata_t *edata) {
return (bool)((edata->e_bits & EDATA_BITS_COMMITTED_MASK) >>
EDATA_BITS_COMMITTED_SHIFT);
return (bool)((edata->e_bits & EDATA_BITS_COMMITTED_MASK)
>> EDATA_BITS_COMMITTED_SHIFT);
}
static inline extent_pai_t
edata_pai_get(const edata_t *edata) {
return (extent_pai_t)((edata->e_bits & EDATA_BITS_PAI_MASK) >>
EDATA_BITS_PAI_SHIFT);
return (extent_pai_t)((edata->e_bits & EDATA_BITS_PAI_MASK)
>> EDATA_BITS_PAI_SHIFT);
}
static inline bool
edata_slab_get(const edata_t *edata) {
return (bool)((edata->e_bits & EDATA_BITS_SLAB_MASK) >>
EDATA_BITS_SLAB_SHIFT);
return (bool)((edata->e_bits & EDATA_BITS_SLAB_MASK)
>> EDATA_BITS_SLAB_SHIFT);
}
static inline unsigned
edata_nfree_get(const edata_t *edata) {
assert(edata_slab_get(edata));
return (unsigned)((edata->e_bits & EDATA_BITS_NFREE_MASK) >>
EDATA_BITS_NFREE_SHIFT);
return (unsigned)((edata->e_bits & EDATA_BITS_NFREE_MASK)
>> EDATA_BITS_NFREE_SHIFT);
}
static inline void *
edata_base_get(const edata_t *edata) {
assert(edata->e_addr == PAGE_ADDR2BASE(edata->e_addr) ||
!edata_slab_get(edata));
assert(edata->e_addr == PAGE_ADDR2BASE(edata->e_addr)
|| !edata_slab_get(edata));
return PAGE_ADDR2BASE(edata->e_addr);
}
static inline void *
edata_addr_get(const edata_t *edata) {
assert(edata->e_addr == PAGE_ADDR2BASE(edata->e_addr) ||
!edata_slab_get(edata));
assert(edata->e_addr == PAGE_ADDR2BASE(edata->e_addr)
|| !edata_slab_get(edata));
return edata->e_addr;
}
@ -375,19 +447,19 @@ edata_ps_get(const edata_t *edata) {
static inline void *
edata_before_get(const edata_t *edata) {
return (void *)((uintptr_t)edata_base_get(edata) - PAGE);
return (void *)((byte_t *)edata_base_get(edata) - PAGE);
}
static inline void *
edata_last_get(const edata_t *edata) {
return (void *)((uintptr_t)edata_base_get(edata) +
edata_size_get(edata) - PAGE);
return (void *)((byte_t *)edata_base_get(edata) + edata_size_get(edata)
- PAGE);
}
static inline void *
edata_past_get(const edata_t *edata) {
return (void *)((uintptr_t)edata_base_get(edata) +
edata_size_get(edata));
return (
void *)((byte_t *)edata_base_get(edata) + edata_size_get(edata));
}
static inline slab_data_t *
@@ -404,8 +476,8 @@ edata_slab_data_get_const(const edata_t *edata) {
static inline prof_tctx_t *
edata_prof_tctx_get(const edata_t *edata) {
return (prof_tctx_t *)atomic_load_p(&edata->e_prof_info.e_prof_tctx,
ATOMIC_ACQUIRE);
return (prof_tctx_t *)atomic_load_p(
&edata->e_prof_info.e_prof_tctx, ATOMIC_ACQUIRE);
}
static inline const nstime_t *
@@ -426,16 +498,16 @@ edata_prof_recent_alloc_get_dont_call_directly(const edata_t *edata) {
static inline void
edata_arena_ind_set(edata_t *edata, unsigned arena_ind) {
edata->e_bits = (edata->e_bits & ~EDATA_BITS_ARENA_MASK) |
((uint64_t)arena_ind << EDATA_BITS_ARENA_SHIFT);
edata->e_bits = (edata->e_bits & ~EDATA_BITS_ARENA_MASK)
| ((uint64_t)arena_ind << EDATA_BITS_ARENA_SHIFT);
}
static inline void
edata_binshard_set(edata_t *edata, unsigned binshard) {
/* The assertion assumes szind is set already. */
assert(binshard < bin_infos[edata_szind_get(edata)].n_shards);
edata->e_bits = (edata->e_bits & ~EDATA_BITS_BINSHARD_MASK) |
((uint64_t)binshard << EDATA_BITS_BINSHARD_SHIFT);
edata->e_bits = (edata->e_bits & ~EDATA_BITS_BINSHARD_MASK)
| ((uint64_t)binshard << EDATA_BITS_BINSHARD_SHIFT);
}
static inline void
@@ -451,8 +523,8 @@ edata_size_set(edata_t *edata, size_t size) {
static inline void
edata_esn_set(edata_t *edata, size_t esn) {
edata->e_size_esn = (edata->e_size_esn & ~EDATA_ESN_MASK) | (esn &
EDATA_ESN_MASK);
edata->e_size_esn = (edata->e_size_esn & ~EDATA_ESN_MASK)
| (esn & EDATA_ESN_MASK);
}
static inline void
@@ -469,25 +541,26 @@ edata_ps_set(edata_t *edata, hpdata_t *ps) {
static inline void
edata_szind_set(edata_t *edata, szind_t szind) {
assert(szind <= SC_NSIZES); /* SC_NSIZES means "invalid". */
edata->e_bits = (edata->e_bits & ~EDATA_BITS_SZIND_MASK) |
((uint64_t)szind << EDATA_BITS_SZIND_SHIFT);
edata->e_bits = (edata->e_bits & ~EDATA_BITS_SZIND_MASK)
| ((uint64_t)szind << EDATA_BITS_SZIND_SHIFT);
}
static inline void
edata_nfree_set(edata_t *edata, unsigned nfree) {
assert(edata_slab_get(edata));
edata->e_bits = (edata->e_bits & ~EDATA_BITS_NFREE_MASK) |
((uint64_t)nfree << EDATA_BITS_NFREE_SHIFT);
edata->e_bits = (edata->e_bits & ~EDATA_BITS_NFREE_MASK)
| ((uint64_t)nfree << EDATA_BITS_NFREE_SHIFT);
}
static inline void
edata_nfree_binshard_set(edata_t *edata, unsigned nfree, unsigned binshard) {
/* The assertion assumes szind is set already. */
assert(binshard < bin_infos[edata_szind_get(edata)].n_shards);
edata->e_bits = (edata->e_bits &
(~EDATA_BITS_NFREE_MASK & ~EDATA_BITS_BINSHARD_MASK)) |
((uint64_t)binshard << EDATA_BITS_BINSHARD_SHIFT) |
((uint64_t)nfree << EDATA_BITS_NFREE_SHIFT);
edata->e_bits = (edata->e_bits
& (~EDATA_BITS_NFREE_MASK
& ~EDATA_BITS_BINSHARD_MASK))
| ((uint64_t)binshard << EDATA_BITS_BINSHARD_SHIFT)
| ((uint64_t)nfree << EDATA_BITS_NFREE_SHIFT);
}
static inline void
@@ -515,38 +588,38 @@ edata_sn_set(edata_t *edata, uint64_t sn) {
static inline void
edata_state_set(edata_t *edata, extent_state_t state) {
edata->e_bits = (edata->e_bits & ~EDATA_BITS_STATE_MASK) |
((uint64_t)state << EDATA_BITS_STATE_SHIFT);
edata->e_bits = (edata->e_bits & ~EDATA_BITS_STATE_MASK)
| ((uint64_t)state << EDATA_BITS_STATE_SHIFT);
}
static inline void
edata_guarded_set(edata_t *edata, bool guarded) {
edata->e_bits = (edata->e_bits & ~EDATA_BITS_GUARDED_MASK) |
((uint64_t)guarded << EDATA_BITS_GUARDED_SHIFT);
edata->e_bits = (edata->e_bits & ~EDATA_BITS_GUARDED_MASK)
| ((uint64_t)guarded << EDATA_BITS_GUARDED_SHIFT);
}
static inline void
edata_zeroed_set(edata_t *edata, bool zeroed) {
edata->e_bits = (edata->e_bits & ~EDATA_BITS_ZEROED_MASK) |
((uint64_t)zeroed << EDATA_BITS_ZEROED_SHIFT);
edata->e_bits = (edata->e_bits & ~EDATA_BITS_ZEROED_MASK)
| ((uint64_t)zeroed << EDATA_BITS_ZEROED_SHIFT);
}
static inline void
edata_committed_set(edata_t *edata, bool committed) {
edata->e_bits = (edata->e_bits & ~EDATA_BITS_COMMITTED_MASK) |
((uint64_t)committed << EDATA_BITS_COMMITTED_SHIFT);
edata->e_bits = (edata->e_bits & ~EDATA_BITS_COMMITTED_MASK)
| ((uint64_t)committed << EDATA_BITS_COMMITTED_SHIFT);
}
static inline void
edata_pai_set(edata_t *edata, extent_pai_t pai) {
edata->e_bits = (edata->e_bits & ~EDATA_BITS_PAI_MASK) |
((uint64_t)pai << EDATA_BITS_PAI_SHIFT);
edata->e_bits = (edata->e_bits & ~EDATA_BITS_PAI_MASK)
| ((uint64_t)pai << EDATA_BITS_PAI_SHIFT);
}
static inline void
edata_slab_set(edata_t *edata, bool slab) {
edata->e_bits = (edata->e_bits & ~EDATA_BITS_SLAB_MASK) |
((uint64_t)slab << EDATA_BITS_SLAB_SHIFT);
edata->e_bits = (edata->e_bits & ~EDATA_BITS_SLAB_MASK)
| ((uint64_t)slab << EDATA_BITS_SLAB_SHIFT);
}
static inline void
@@ -565,22 +638,22 @@ edata_prof_alloc_size_set(edata_t *edata, size_t size) {
}
static inline void
edata_prof_recent_alloc_set_dont_call_directly(edata_t *edata,
prof_recent_t *recent_alloc) {
edata_prof_recent_alloc_set_dont_call_directly(
edata_t *edata, prof_recent_t *recent_alloc) {
atomic_store_p(&edata->e_prof_info.e_prof_recent_alloc, recent_alloc,
ATOMIC_RELAXED);
}
static inline bool
edata_is_head_get(edata_t *edata) {
return (bool)((edata->e_bits & EDATA_BITS_IS_HEAD_MASK) >>
EDATA_BITS_IS_HEAD_SHIFT);
return (bool)((edata->e_bits & EDATA_BITS_IS_HEAD_MASK)
>> EDATA_BITS_IS_HEAD_SHIFT);
}
static inline void
edata_is_head_set(edata_t *edata, bool is_head) {
edata->e_bits = (edata->e_bits & ~EDATA_BITS_IS_HEAD_MASK) |
((uint64_t)is_head << EDATA_BITS_IS_HEAD_SHIFT);
edata->e_bits = (edata->e_bits & ~EDATA_BITS_IS_HEAD_MASK)
| ((uint64_t)is_head << EDATA_BITS_IS_HEAD_SHIFT);
}
static inline bool
@@ -619,7 +692,8 @@ edata_init(edata_t *edata, unsigned arena_ind, void *addr, size_t size,
}
static inline void
edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn) {
edata_binit(
edata_t *edata, void *addr, size_t bsize, uint64_t sn, bool reused) {
edata_arena_ind_set(edata, (1U << MALLOCX_ARENA_BITS) - 1);
edata_addr_set(edata, addr);
edata_bsize_set(edata, bsize);
@@ -627,7 +701,8 @@ edata_binit(edata_t *edata, void *addr, size_t bsize, uint64_t sn) {
edata_szind_set(edata, SC_NSIZES);
edata_sn_set(edata, sn);
edata_state_set(edata, extent_state_active);
edata_guarded_set(edata, false);
/* See comments in base_edata_is_reused. */
edata_guarded_set(edata, reused);
edata_zeroed_set(edata, true);
edata_committed_set(edata, true);
/*
@@ -656,20 +731,47 @@ edata_ead_comp(const edata_t *a, const edata_t *b) {
static inline edata_cmp_summary_t
edata_cmp_summary_get(const edata_t *edata) {
return (edata_cmp_summary_t){edata_sn_get(edata),
(uintptr_t)edata_addr_get(edata)};
edata_cmp_summary_t result;
result.sn = edata_sn_get(edata);
result.addr = (uintptr_t)edata_addr_get(edata);
return result;
}
#ifdef JEMALLOC_HAVE_INT128
JEMALLOC_ALWAYS_INLINE unsigned __int128
edata_cmp_summary_encode(edata_cmp_summary_t src) {
return ((unsigned __int128)src.sn << 64) | src.addr;
}
static inline int
edata_cmp_summary_comp(edata_cmp_summary_t a, edata_cmp_summary_t b) {
int ret;
ret = (a.sn > b.sn) - (a.sn < b.sn);
if (ret != 0) {
return ret;
}
ret = (a.addr > b.addr) - (a.addr < b.addr);
return ret;
unsigned __int128 a_encoded = edata_cmp_summary_encode(a);
unsigned __int128 b_encoded = edata_cmp_summary_encode(b);
if (a_encoded < b_encoded)
return -1;
if (a_encoded == b_encoded)
return 0;
return 1;
}
#else
static inline int
edata_cmp_summary_comp(edata_cmp_summary_t a, edata_cmp_summary_t b) {
/*
* Logically, what we're doing here is comparing based on `.sn`, and
* falling back to comparing on `.addr` in the case that `a.sn == b.sn`.
* We accomplish this by multiplying the result of the `.sn` comparison
* by 2, so that so long as it is not 0, it will dominate the `.addr`
* comparison in determining the sign of the returned result value.
* The justification for doing things this way is that this is
* branchless - all of the branches that would be present in a
* straightforward implementation are common cases, and thus the branch
* prediction accuracy is not great. As a result, this implementation
* is measurably faster (by around 30%).
*/
return (2 * ((a.sn > b.sn) - (a.sn < b.sn)))
+ ((a.addr > b.addr) - (a.addr < b.addr));
}
#endif
static inline int
edata_snad_comp(const edata_t *a, const edata_t *b) {
@@ -681,18 +783,13 @@ edata_snad_comp(const edata_t *a, const edata_t *b) {
static inline int
edata_esnead_comp(const edata_t *a, const edata_t *b) {
int ret;
ret = edata_esn_comp(a, b);
if (ret != 0) {
return ret;
}
ret = edata_ead_comp(a, b);
return ret;
/*
* Similar to `edata_cmp_summary_comp`, we've opted for a
* branchless implementation for the sake of performance.
*/
return (2 * edata_esn_comp(a, b)) + edata_ead_comp(a, b);
}
ph_proto(, edata_avail, edata_t)
ph_proto(, edata_heap, edata_t)
ph_proto(, edata_avail, edata_t) ph_proto(, edata_heap, edata_t)
#endif /* JEMALLOC_INTERNAL_EDATA_H */
@@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_EDATA_CACHE_H
#define JEMALLOC_INTERNAL_EDATA_CACHE_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/base.h"
/* For tests only. */
@@ -14,13 +15,13 @@
typedef struct edata_cache_s edata_cache_t;
struct edata_cache_s {
edata_avail_t avail;
atomic_zu_t count;
edata_avail_t avail;
atomic_zu_t count;
malloc_mutex_t mtx;
base_t *base;
base_t *base;
};
bool edata_cache_init(edata_cache_t *edata_cache, base_t *base);
bool edata_cache_init(edata_cache_t *edata_cache, base_t *base);
edata_t *edata_cache_get(tsdn_t *tsdn, edata_cache_t *edata_cache);
void edata_cache_put(tsdn_t *tsdn, edata_cache_t *edata_cache, edata_t *edata);
@@ -36,14 +37,14 @@ void edata_cache_postfork_child(tsdn_t *tsdn, edata_cache_t *edata_cache);
typedef struct edata_cache_fast_s edata_cache_fast_t;
struct edata_cache_fast_s {
edata_list_inactive_t list;
edata_cache_t *fallback;
bool disabled;
edata_cache_t *fallback;
bool disabled;
};
void edata_cache_fast_init(edata_cache_fast_t *ecs, edata_cache_t *fallback);
edata_t *edata_cache_fast_get(tsdn_t *tsdn, edata_cache_fast_t *ecs);
void edata_cache_fast_put(tsdn_t *tsdn, edata_cache_fast_t *ecs,
edata_t *edata);
void edata_cache_fast_put(
tsdn_t *tsdn, edata_cache_fast_t *ecs, edata_t *edata);
void edata_cache_fast_disable(tsdn_t *tsdn, edata_cache_fast_t *ecs);
#endif /* JEMALLOC_INTERNAL_EDATA_CACHE_H */
@@ -1,8 +1,11 @@
#ifndef JEMALLOC_INTERNAL_EHOOKS_H
#define JEMALLOC_INTERNAL_EHOOKS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/extent_mmap.h"
#include "jemalloc/internal/tsd.h"
#include "jemalloc/internal/tsd_types.h"
/*
* This module is the internal interface to the extent hooks (both
@@ -43,17 +46,17 @@ extern const extent_hooks_t ehooks_default_extent_hooks;
*/
void *ehooks_default_alloc_impl(tsdn_t *tsdn, void *new_addr, size_t size,
size_t alignment, bool *zero, bool *commit, unsigned arena_ind);
bool ehooks_default_dalloc_impl(void *addr, size_t size);
void ehooks_default_destroy_impl(void *addr, size_t size);
bool ehooks_default_commit_impl(void *addr, size_t offset, size_t length);
bool ehooks_default_decommit_impl(void *addr, size_t offset, size_t length);
bool ehooks_default_dalloc_impl(void *addr, size_t size);
void ehooks_default_destroy_impl(void *addr, size_t size);
bool ehooks_default_commit_impl(void *addr, size_t offset, size_t length);
bool ehooks_default_decommit_impl(void *addr, size_t offset, size_t length);
#ifdef PAGES_CAN_PURGE_LAZY
bool ehooks_default_purge_lazy_impl(void *addr, size_t offset, size_t length);
#endif
#ifdef PAGES_CAN_PURGE_FORCED
bool ehooks_default_purge_forced_impl(void *addr, size_t offset, size_t length);
#endif
bool ehooks_default_split_impl();
bool ehooks_default_split_impl(void);
/*
* Merge is the only default extent hook we declare -- see the comment in
* ehooks_merge.
@@ -113,8 +116,8 @@ ehooks_get_extent_hooks_ptr(ehooks_t *ehooks) {
static inline bool
ehooks_are_default(ehooks_t *ehooks) {
return ehooks_get_extent_hooks_ptr(ehooks) ==
&ehooks_default_extent_hooks;
return ehooks_get_extent_hooks_ptr(ehooks)
== &ehooks_default_extent_hooks;
}
/*
@@ -186,16 +189,15 @@ ehooks_debug_zero_check(void *addr, size_t size) {
}
}
static inline void *
ehooks_alloc(tsdn_t *tsdn, ehooks_t *ehooks, void *new_addr, size_t size,
size_t alignment, bool *zero, bool *commit) {
bool orig_zero = *zero;
void *ret;
bool orig_zero = *zero;
void *ret;
extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
if (extent_hooks == &ehooks_default_extent_hooks) {
ret = ehooks_default_alloc_impl(tsdn, new_addr, size,
alignment, zero, commit, ehooks_ind_get(ehooks));
ret = ehooks_default_alloc_impl(tsdn, new_addr, size, alignment,
zero, commit, ehooks_ind_get(ehooks));
} else {
ehooks_pre_reentrancy(tsdn);
ret = extent_hooks->alloc(extent_hooks, new_addr, size,
@@ -211,8 +213,8 @@ ehooks_alloc(tsdn_t *tsdn, ehooks_t *ehooks, void *new_addr, size_t size,
}
static inline bool
ehooks_dalloc(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
bool committed) {
ehooks_dalloc(
tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, bool committed) {
extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
if (extent_hooks == &ehooks_default_extent_hooks) {
return ehooks_default_dalloc_impl(addr, size);
@@ -228,8 +230,8 @@ ehooks_dalloc(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
}
static inline void
ehooks_destroy(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
bool committed) {
ehooks_destroy(
tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size, bool committed) {
extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
if (extent_hooks == &ehooks_default_extent_hooks) {
ehooks_default_destroy_impl(addr, size);
@@ -247,15 +249,15 @@ static inline bool
ehooks_commit(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size,
size_t offset, size_t length) {
extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
bool err;
bool err;
if (extent_hooks == &ehooks_default_extent_hooks) {
err = ehooks_default_commit_impl(addr, offset, length);
} else if (extent_hooks->commit == NULL) {
err = true;
} else {
ehooks_pre_reentrancy(tsdn);
err = extent_hooks->commit(extent_hooks, addr, size,
offset, length, ehooks_ind_get(ehooks));
err = extent_hooks->commit(extent_hooks, addr, size, offset,
length, ehooks_ind_get(ehooks));
ehooks_post_reentrancy(tsdn);
}
if (!err) {
@@ -381,7 +383,7 @@ ehooks_zero(tsdn_t *tsdn, ehooks_t *ehooks, void *addr, size_t size) {
static inline bool
ehooks_guard(tsdn_t *tsdn, ehooks_t *ehooks, void *guard1, void *guard2) {
bool err;
bool err;
extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
if (extent_hooks == &ehooks_default_extent_hooks) {
@@ -396,7 +398,7 @@ ehooks_guard(tsdn_t *tsdn, ehooks_t *ehooks, void *guard1, void *guard2) {
static inline bool
ehooks_unguard(tsdn_t *tsdn, ehooks_t *ehooks, void *guard1, void *guard2) {
bool err;
bool err;
extent_hooks_t *extent_hooks = ehooks_get_extent_hooks_ptr(ehooks);
if (extent_hooks == &ehooks_default_extent_hooks) {
@@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_EMAP_H
#define JEMALLOC_INTERNAL_EMAP_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/base.h"
#include "jemalloc/internal/rtree.h"
@@ -9,9 +10,9 @@
* EMAP_DECLARE_RTREE_CTX;
* in uses will avoid empty-statement warnings.
*/
#define EMAP_DECLARE_RTREE_CTX \
rtree_ctx_t rtree_ctx_fallback; \
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback)
#define EMAP_DECLARE_RTREE_CTX \
rtree_ctx_t rtree_ctx_fallback; \
rtree_ctx_t *rtree_ctx = tsdn_rtree_ctx(tsdn, &rtree_ctx_fallback)
typedef struct emap_s emap_t;
struct emap_s {
@@ -19,26 +20,27 @@ struct emap_s {
};
/* Used to pass rtree lookup context down the path. */
typedef struct emap_alloc_ctx_t emap_alloc_ctx_t;
struct emap_alloc_ctx_t {
typedef struct emap_alloc_ctx_s emap_alloc_ctx_t;
struct emap_alloc_ctx_s {
size_t usize;
szind_t szind;
bool slab;
bool slab;
};
typedef struct emap_full_alloc_ctx_s emap_full_alloc_ctx_t;
struct emap_full_alloc_ctx_s {
szind_t szind;
bool slab;
szind_t szind;
bool slab;
edata_t *edata;
};
bool emap_init(emap_t *emap, base_t *base, bool zeroed);
void emap_remap(tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind,
bool slab);
void emap_remap(
tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind, bool slab);
void emap_update_edata_state(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
extent_state_t state);
void emap_update_edata_state(
tsdn_t *tsdn, emap_t *emap, edata_t *edata, extent_state_t state);
/*
* The two acquire functions below allow accessing neighbor edatas, if it's safe
@@ -60,16 +62,16 @@ edata_t *emap_try_acquire_edata_neighbor(tsdn_t *tsdn, emap_t *emap,
bool forward);
edata_t *emap_try_acquire_edata_neighbor_expand(tsdn_t *tsdn, emap_t *emap,
edata_t *edata, extent_pai_t pai, extent_state_t expected_state);
void emap_release_edata(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
extent_state_t new_state);
void emap_release_edata(
tsdn_t *tsdn, emap_t *emap, edata_t *edata, extent_state_t new_state);
/*
* Associate the given edata with its beginning and end address, setting the
* szind and slab info appropriately.
* Returns true on error (i.e. resource exhaustion).
*/
bool emap_register_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
szind_t szind, bool slab);
bool emap_register_boundary(
tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind, bool slab);
/*
* Does the same thing, but with the interior of the range, for slab
@@ -90,8 +92,8 @@ bool emap_register_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
* touched, so no allocation is necessary to fill the interior once the boundary
* has been touched.
*/
void emap_register_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata,
szind_t szind);
void emap_register_interior(
tsdn_t *tsdn, emap_t *emap, edata_t *edata, szind_t szind);
void emap_deregister_boundary(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
void emap_deregister_interior(tsdn_t *tsdn, emap_t *emap, edata_t *edata);
@@ -159,8 +161,8 @@ emap_edata_in_transition(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
emap_assert_mapped(tsdn, emap, edata);
EMAP_DECLARE_RTREE_CTX;
rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx,
(uintptr_t)edata_base_get(edata));
rtree_contents_t contents = rtree_read(
tsdn, &emap->rtree, rtree_ctx, (uintptr_t)edata_base_get(edata));
return edata_state_in_transition(contents.metadata.state);
}
@@ -185,16 +187,16 @@ emap_edata_is_acquired(tsdn_t *tsdn, emap_t *emap, edata_t *edata) {
*/
EMAP_DECLARE_RTREE_CTX;
rtree_leaf_elm_t *elm = rtree_leaf_elm_lookup(tsdn, &emap->rtree,
rtree_ctx, (uintptr_t)edata_base_get(edata), /* dependent */ true,
rtree_ctx, (uintptr_t)edata_base_get(edata), /* dependent */ false,
/* init_missing */ false);
if (elm == NULL) {
return true;
}
rtree_contents_t contents = rtree_leaf_elm_read(tsdn, &emap->rtree, elm,
/* dependent */ true);
if (contents.edata == NULL ||
contents.metadata.state == extent_state_active ||
edata_state_in_transition(contents.metadata.state)) {
/* dependent */ false);
if (contents.edata == NULL
|| contents.metadata.state == extent_state_active
|| edata_state_in_transition(contents.metadata.state)) {
return true;
}
@@ -209,8 +211,8 @@ extent_assert_can_coalesce(const edata_t *inner, const edata_t *outer) {
assert(edata_state_get(inner) == extent_state_active);
assert(edata_state_get(outer) == extent_state_merging);
assert(!edata_guarded_get(inner) && !edata_guarded_get(outer));
assert(edata_base_get(inner) == edata_past_get(outer) ||
edata_base_get(outer) == edata_past_get(inner));
assert(edata_base_get(inner) == edata_past_get(outer)
|| edata_base_get(outer) == edata_past_get(inner));
}
JEMALLOC_ALWAYS_INLINE void
@@ -229,16 +231,46 @@ emap_edata_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr) {
return rtree_read(tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr).edata;
}
JEMALLOC_ALWAYS_INLINE void
emap_alloc_ctx_init(
emap_alloc_ctx_t *alloc_ctx, szind_t szind, bool slab, size_t usize) {
alloc_ctx->szind = szind;
alloc_ctx->slab = slab;
alloc_ctx->usize = usize;
assert(
sz_large_size_classes_disabled() || usize == sz_index2size(szind));
}
JEMALLOC_ALWAYS_INLINE size_t
emap_alloc_ctx_usize_get(emap_alloc_ctx_t *alloc_ctx) {
assert(alloc_ctx->szind < SC_NSIZES);
if (alloc_ctx->slab) {
assert(alloc_ctx->usize == sz_index2size(alloc_ctx->szind));
return sz_index2size(alloc_ctx->szind);
}
assert(sz_large_size_classes_disabled()
|| alloc_ctx->usize == sz_index2size(alloc_ctx->szind));
assert(alloc_ctx->usize <= SC_LARGE_MAXCLASS);
return alloc_ctx->usize;
}
/* Fills in alloc_ctx with the info in the map. */
JEMALLOC_ALWAYS_INLINE void
emap_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
emap_alloc_ctx_t *alloc_ctx) {
emap_alloc_ctx_lookup(
tsdn_t *tsdn, emap_t *emap, const void *ptr, emap_alloc_ctx_t *alloc_ctx) {
EMAP_DECLARE_RTREE_CTX;
rtree_metadata_t metadata = rtree_metadata_read(tsdn, &emap->rtree,
rtree_ctx, (uintptr_t)ptr);
alloc_ctx->szind = metadata.szind;
alloc_ctx->slab = metadata.slab;
rtree_contents_t contents = rtree_read(
tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr);
/*
* If the alloc is invalid, do not calculate usize since edata
* could be corrupted.
*/
emap_alloc_ctx_init(alloc_ctx, contents.metadata.szind,
contents.metadata.slab,
(contents.metadata.szind == SC_NSIZES || contents.edata == NULL)
? 0
: edata_usize_get(contents.edata));
}
/* The pointer must be mapped. */
@@ -247,8 +279,8 @@ emap_full_alloc_ctx_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
emap_full_alloc_ctx_t *full_alloc_ctx) {
EMAP_DECLARE_RTREE_CTX;
rtree_contents_t contents = rtree_read(tsdn, &emap->rtree, rtree_ctx,
(uintptr_t)ptr);
rtree_contents_t contents = rtree_read(
tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr);
full_alloc_ctx->edata = contents.edata;
full_alloc_ctx->szind = contents.metadata.szind;
full_alloc_ctx->slab = contents.metadata.slab;
@@ -265,8 +297,8 @@ emap_full_alloc_ctx_try_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
EMAP_DECLARE_RTREE_CTX;
rtree_contents_t contents;
bool err = rtree_read_independent(tsdn, &emap->rtree, rtree_ctx,
(uintptr_t)ptr, &contents);
bool err = rtree_read_independent(
tsdn, &emap->rtree, rtree_ctx, (uintptr_t)ptr, &contents);
if (err) {
return true;
}
@@ -281,19 +313,26 @@ emap_full_alloc_ctx_try_lookup(tsdn_t *tsdn, emap_t *emap, const void *ptr,
* fast path, e.g. when the metadata key is not cached.
*/
JEMALLOC_ALWAYS_INLINE bool
emap_alloc_ctx_try_lookup_fast(tsd_t *tsd, emap_t *emap, const void *ptr,
emap_alloc_ctx_t *alloc_ctx) {
emap_alloc_ctx_try_lookup_fast(
tsd_t *tsd, emap_t *emap, const void *ptr, emap_alloc_ctx_t *alloc_ctx) {
/* Use the unsafe getter since this may get called during exit. */
rtree_ctx_t *rtree_ctx = tsd_rtree_ctxp_get_unsafe(tsd);
rtree_metadata_t metadata;
bool err = rtree_metadata_try_read_fast(tsd_tsdn(tsd), &emap->rtree,
rtree_ctx, (uintptr_t)ptr, &metadata);
bool err = rtree_metadata_try_read_fast(
tsd_tsdn(tsd), &emap->rtree, rtree_ctx, (uintptr_t)ptr, &metadata);
if (err) {
return true;
}
/*
* Small allocs using the fastpath can always use index to get the
* usize. Therefore, do not set alloc_ctx->usize here.
*/
alloc_ctx->szind = metadata.szind;
alloc_ctx->slab = metadata.slab;
if (config_debug) {
alloc_ctx->usize = SC_LARGE_MAXCLASS + 1;
}
return false;
}
@@ -308,11 +347,12 @@ typedef const void *(*emap_ptr_getter)(void *ctx, size_t ind);
* This allows size-checking assertions, which we can only do while we're in the
* process of edata lookups.
*/
typedef void (*emap_metadata_visitor)(void *ctx, emap_full_alloc_ctx_t *alloc_ctx);
typedef void (*emap_metadata_visitor)(
void *ctx, emap_full_alloc_ctx_t *alloc_ctx);
typedef union emap_batch_lookup_result_u emap_batch_lookup_result_t;
union emap_batch_lookup_result_u {
edata_t *edata;
edata_t *edata;
rtree_leaf_elm_t *rtree_leaf;
};
@@ -338,8 +378,8 @@ emap_edata_lookup_batch(tsd_t *tsd, emap_t *emap, size_t nptrs,
for (size_t i = 0; i < nptrs; i++) {
rtree_leaf_elm_t *elm = result[i].rtree_leaf;
rtree_contents_t contents = rtree_leaf_elm_read(tsd_tsdn(tsd),
&emap->rtree, elm, /* dependent */ true);
rtree_contents_t contents = rtree_leaf_elm_read(
tsd_tsdn(tsd), &emap->rtree, elm, /* dependent */ true);
result[i].edata = contents.edata;
emap_full_alloc_ctx_t alloc_ctx;
/*
@@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_EMITTER_H
#define JEMALLOC_INTERNAL_EMITTER_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/malloc_io.h"
#include "jemalloc/internal/ql.h"
typedef enum emitter_output_e emitter_output_t;
@@ -40,18 +44,18 @@ typedef struct emitter_col_s emitter_col_t;
struct emitter_col_s {
/* Filled in by the user. */
emitter_justify_t justify;
int width;
emitter_type_t type;
int width;
emitter_type_t type;
union {
bool bool_val;
int int_val;
unsigned unsigned_val;
uint32_t uint32_val;
uint32_t uint32_t_val;
uint64_t uint64_val;
uint64_t uint64_t_val;
size_t size_val;
ssize_t ssize_val;
bool bool_val;
int int_val;
unsigned unsigned_val;
uint32_t uint32_val;
uint32_t uint32_t_val;
uint64_t uint64_val;
uint64_t uint64_t_val;
size_t size_val;
ssize_t ssize_val;
const char *str_val;
};
@@ -69,8 +73,8 @@ struct emitter_s {
emitter_output_t output;
/* The output information. */
write_cb_t *write_cb;
void *cbopaque;
int nesting_depth;
void *cbopaque;
int nesting_depth;
/* True if we've already emitted a value at the given depth. */
bool item_at_depth;
/* True if we emitted a key and will emit corresponding value next. */
@@ -79,8 +83,8 @@
static inline bool
emitter_outputs_json(emitter_t *emitter) {
return emitter->output == emitter_output_json ||
emitter->output == emitter_output_json_compact;
return emitter->output == emitter_output_json
|| emitter->output == emitter_output_json_compact;
}
/* Internal convenience function. Write to the emitter the given string. */
@@ -94,26 +98,57 @@ emitter_printf(emitter_t *emitter, const char *format, ...) {
va_end(ap);
}
static inline const char * JEMALLOC_FORMAT_ARG(3)
emitter_gen_fmt(char *out_fmt, size_t out_size, const char *fmt_specifier,
emitter_justify_t justify, int width) {
static inline const char *
JEMALLOC_FORMAT_ARG(3) emitter_gen_fmt(char *out_fmt, size_t out_size,
const char *fmt_specifier, emitter_justify_t justify, int width) {
size_t written;
fmt_specifier++;
if (justify == emitter_justify_none) {
written = malloc_snprintf(out_fmt, out_size,
"%%%s", fmt_specifier);
written = malloc_snprintf(
out_fmt, out_size, "%%%s", fmt_specifier);
} else if (justify == emitter_justify_left) {
written = malloc_snprintf(out_fmt, out_size,
"%%-%d%s", width, fmt_specifier);
written = malloc_snprintf(
out_fmt, out_size, "%%-%d%s", width, fmt_specifier);
} else {
written = malloc_snprintf(out_fmt, out_size,
"%%%d%s", width, fmt_specifier);
written = malloc_snprintf(
out_fmt, out_size, "%%%d%s", width, fmt_specifier);
}
/* Only happens in case of bad format string, which *we* choose. */
assert(written < out_size);
assert(written < out_size);
return out_fmt;
}
static inline void
emitter_emit_str(emitter_t *emitter, emitter_justify_t justify, int width,
char *fmt, size_t fmt_size, const char *str) {
#define BUF_SIZE 256
char buf[BUF_SIZE];
size_t str_written = malloc_snprintf(buf, BUF_SIZE, "\"%s\"", str);
emitter_printf(
emitter, emitter_gen_fmt(fmt, fmt_size, "%s", justify, width), buf);
if (str_written < BUF_SIZE) {
return;
}
/*
* There is no support for long string justification at the moment as
* we output them partially with multiple malloc_snprintf calls and
* justification will work correctly only within one call.
* Fortunately this is not a big concern as we don't use justification
* with long strings right now.
*
* We emitted leading quotation mark and trailing '\0', hence need to
* exclude extra characters from str shift.
*/
str += BUF_SIZE - 2;
do {
str_written = malloc_snprintf(buf, BUF_SIZE, "%s\"", str);
str += str_written >= BUF_SIZE ? BUF_SIZE - 1 : str_written;
emitter_printf(emitter,
emitter_gen_fmt(fmt, fmt_size, "%s", justify, width), buf);
} while (str_written >= BUF_SIZE);
#undef BUF_SIZE
}
/*
* Internal. Emit the given value type in the relevant encoding (so that the
* bool true gets mapped to json "true", but the string "true" gets mapped to
@@ -124,8 +159,6 @@ emitter_gen_fmt(char *out_fmt, size_t out_size, const char *fmt_specifier,
static inline void
emitter_print_value(emitter_t *emitter, emitter_justify_t justify, int width,
emitter_type_t value_type, const void *value) {
size_t str_written;
#define BUF_SIZE 256
#define FMT_SIZE 10
/*
* We dynamically generate a format string to emit, to let us use the
@@ -134,18 +167,17 @@ emitter_print_value(emitter_t *emitter, emitter_justify_t justify, int width,
* cases.
*/
char fmt[FMT_SIZE];
char buf[BUF_SIZE];
#define EMIT_SIMPLE(type, format) \
emitter_printf(emitter, \
emitter_gen_fmt(fmt, FMT_SIZE, format, justify, width), \
#define EMIT_SIMPLE(type, format) \
emitter_printf(emitter, \
emitter_gen_fmt(fmt, FMT_SIZE, format, justify, width), \
*(const type *)value);
switch (value_type) {
case emitter_type_bool:
emitter_printf(emitter,
emitter_gen_fmt(fmt, FMT_SIZE, "%s", justify, width),
*(const bool *)value ? "true" : "false");
*(const bool *)value ? "true" : "false");
break;
case emitter_type_int:
EMIT_SIMPLE(int, "%d")
@@ -163,15 +195,8 @@ emitter_print_value(emitter_t *emitter, emitter_justify_t justify, int width,
EMIT_SIMPLE(size_t, "%zu")
break;
case emitter_type_string:
str_written = malloc_snprintf(buf, BUF_SIZE, "\"%s\"",
emitter_emit_str(emitter, justify, width, fmt, FMT_SIZE,
*(const char *const *)value);
/*
* We control the strings we output; we shouldn't get anything
* anywhere near the fmt size.
*/
assert(str_written < BUF_SIZE);
emitter_printf(emitter,
emitter_gen_fmt(fmt, FMT_SIZE, "%s", justify, width), buf);
break;
case emitter_type_uint32:
EMIT_SIMPLE(uint32_t, "%" FMTu32)
@@ -185,11 +210,9 @@ emitter_print_value(emitter_t *emitter, emitter_justify_t justify, int width,
default:
unreachable();
}
#undef BUF_SIZE
#undef FMT_SIZE
}
/* Internal functions. In json mode, tracks nesting state. */
static inline void
emitter_nest_inc(emitter_t *emitter) {
@@ -205,7 +228,7 @@ emitter_nest_dec(emitter_t *emitter) {
static inline void
emitter_indent(emitter_t *emitter) {
int amount = emitter->nesting_depth;
int amount = emitter->nesting_depth;
const char *indent_str;
assert(emitter->output != emitter_output_json_compact);
if (emitter->output == emitter_output_json) {
@@ -267,12 +290,12 @@ emitter_json_key(emitter_t *emitter, const char *json_key) {
}
static inline void
emitter_json_value(emitter_t *emitter, emitter_type_t value_type,
const void *value) {
emitter_json_value(
emitter_t *emitter, emitter_type_t value_type, const void *value) {
if (emitter_outputs_json(emitter)) {
emitter_json_key_prefix(emitter);
emitter_print_value(emitter, emitter_justify_none, -1,
value_type, value);
emitter_print_value(
emitter, emitter_justify_none, -1, value_type, value);
emitter->item_at_depth = true;
}
}
@@ -343,7 +366,6 @@ emitter_json_object_end(emitter_t *emitter) {
}
}
/******************************************************************************/
/* Table public API. */
@@ -365,14 +387,13 @@ emitter_table_dict_end(emitter_t *emitter) {
static inline void
emitter_table_kv_note(emitter_t *emitter, const char *table_key,
emitter_type_t value_type, const void *value,
const char *table_note_key, emitter_type_t table_note_value_type,
const void *table_note_value) {
emitter_type_t value_type, const void *value, const char *table_note_key,
emitter_type_t table_note_value_type, const void *table_note_value) {
if (emitter->output == emitter_output_table) {
emitter_indent(emitter);
emitter_printf(emitter, "%s: ", table_key);
emitter_print_value(emitter, emitter_justify_none, -1,
value_type, value);
emitter_print_value(
emitter, emitter_justify_none, -1, value_type, value);
if (table_note_key != NULL) {
emitter_printf(emitter, " (%s: ", table_note_key);
emitter_print_value(emitter, emitter_justify_none, -1,
@@ -391,7 +412,6 @@ emitter_table_kv(emitter_t *emitter, const char *table_key,
emitter_type_bool, NULL);
}
/* Write to the emitter the given string, but only in table mode. */
JEMALLOC_FORMAT_PRINTF(2, 3)
static inline void
@@ -399,7 +419,8 @@ emitter_table_printf(emitter_t *emitter, const char *format, ...) {
if (emitter->output == emitter_output_table) {
va_list ap;
va_start(ap, format);
malloc_vcprintf(emitter->write_cb, emitter->cbopaque, format, ap);
malloc_vcprintf(
emitter->write_cb, emitter->cbopaque, format, ap);
va_end(ap);
}
}
@@ -410,7 +431,7 @@ emitter_table_row(emitter_t *emitter, emitter_row_t *row) {
return;
}
emitter_col_t *col;
ql_foreach(col, &row->cols, link) {
ql_foreach (col, &row->cols, link) {
emitter_print_value(emitter, col->justify, col->width,
col->type, (const void *)&col->bool_val);
}
@@ -428,7 +449,6 @@ emitter_col_init(emitter_col_t *col, emitter_row_t *row) {
ql_tail_insert(&row->cols, col, link);
}
/******************************************************************************/
/*
* Generalized public API. Emits using either JSON or table, according to
@@ -440,9 +460,8 @@ emitter_col_init(emitter_col_t *col, emitter_row_t *row) {
*/
static inline void
emitter_kv_note(emitter_t *emitter, const char *json_key, const char *table_key,
emitter_type_t value_type, const void *value,
const char *table_note_key, emitter_type_t table_note_value_type,
const void *table_note_value) {
emitter_type_t value_type, const void *value, const char *table_note_key,
emitter_type_t table_note_value_type, const void *table_note_value) {
if (emitter_outputs_json(emitter)) {
emitter_json_key(emitter, json_key);
emitter_json_value(emitter, value_type, value);
@@ -461,8 +480,8 @@ emitter_kv(emitter_t *emitter, const char *json_key, const char *table_key,
}
static inline void
emitter_dict_begin(emitter_t *emitter, const char *json_key,
const char *table_header) {
emitter_dict_begin(
emitter_t *emitter, const char *json_key, const char *table_header) {
if (emitter_outputs_json(emitter)) {
emitter_json_key(emitter, json_key);
emitter_json_object_begin(emitter);
@@ -502,8 +521,9 @@ emitter_end(emitter_t *emitter) {
if (emitter_outputs_json(emitter)) {
assert(emitter->nesting_depth == 1);
emitter_nest_dec(emitter);
emitter_printf(emitter, "%s", emitter->output ==
emitter_output_json_compact ? "}" : "\n}\n");
emitter_printf(emitter, "%s",
emitter->output == emitter_output_json_compact ? "}"
: "\n}\n");
}
}

@@ -1,9 +1,10 @@
#ifndef JEMALLOC_INTERNAL_ESET_H
#define JEMALLOC_INTERNAL_ESET_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/fb.h"
#include "jemalloc/internal/edata.h"
#include "jemalloc/internal/fb.h"
#include "jemalloc/internal/mutex.h"
/*

@@ -1,6 +1,8 @@
#ifndef JEMALLOC_INTERNAL_EXP_GROW_H
#define JEMALLOC_INTERNAL_EXP_GROW_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/sz.h"
typedef struct exp_grow_s exp_grow_t;
struct exp_grow_s {
/*
@@ -25,8 +27,7 @@ exp_grow_size_prepare(exp_grow_t *exp_grow, size_t alloc_size_min,
*r_alloc_size = sz_pind2sz(exp_grow->next + *r_skip);
while (*r_alloc_size < alloc_size_min) {
(*r_skip)++;
if (exp_grow->next + *r_skip >=
sz_psz2ind(SC_LARGE_MAXCLASS)) {
if (exp_grow->next + *r_skip >= sz_psz2ind(SC_LARGE_MAXCLASS)) {
/* Outside legal range. */
return true;
}
@@ -42,7 +43,6 @@ exp_grow_size_commit(exp_grow_t *exp_grow, pszind_t skip) {
} else {
exp_grow->next = exp_grow->limit;
}
}
void exp_grow_init(exp_grow_t *exp_grow);

@@ -1,8 +1,10 @@
#ifndef JEMALLOC_INTERNAL_EXTENT_H
#define JEMALLOC_INTERNAL_EXTENT_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/ecache.h"
#include "jemalloc/internal/ehooks.h"
#include "jemalloc/internal/pac.h"
#include "jemalloc/internal/ph.h"
#include "jemalloc/internal/rtree.h"
@@ -19,50 +21,58 @@
#define LG_EXTENT_MAX_ACTIVE_FIT_DEFAULT 6
extern size_t opt_lg_extent_max_active_fit;
#define PROCESS_MADVISE_MAX_BATCH_DEFAULT 0
extern size_t opt_process_madvise_max_batch;
#ifdef JEMALLOC_HAVE_PROCESS_MADVISE
/* The iovec is on the stack. Limit the max batch to avoid stack overflow. */
# define PROCESS_MADVISE_MAX_BATCH_LIMIT \
(VARIABLE_ARRAY_SIZE_MAX / sizeof(struct iovec))
#else
# define PROCESS_MADVISE_MAX_BATCH_LIMIT 0
#endif
edata_t *ecache_alloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment,
bool zero, bool guarded);
edata_t *ecache_alloc_grow(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
ecache_t *ecache, edata_t *expand_edata, size_t size, size_t alignment,
bool zero, bool guarded);
void ecache_dalloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
ecache_t *ecache, edata_t *edata);
void ecache_dalloc(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache,
edata_t *edata);
edata_t *ecache_evict(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
ecache_t *ecache, size_t npages_min);
void extent_gdump_add(tsdn_t *tsdn, const edata_t *edata);
void extent_record(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, ecache_t *ecache,
edata_t *edata);
void extent_dalloc_gap(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
edata_t *edata);
void extent_dalloc_gap(
tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata);
edata_t *extent_alloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
void *new_addr, size_t size, size_t alignment, bool zero, bool *commit,
bool growing_retained);
void extent_dalloc_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
edata_t *edata);
void extent_destroy_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
edata_t *edata);
bool extent_commit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
size_t offset, size_t length);
bool extent_decommit_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
size_t offset, size_t length);
void extent_dalloc_wrapper(
tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata);
void extent_dalloc_wrapper_purged(
tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata);
void extent_destroy_wrapper(
tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *edata);
bool extent_purge_lazy_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
size_t offset, size_t length);
bool extent_purge_forced_wrapper(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
size_t offset, size_t length);
edata_t *extent_split_wrapper(tsdn_t *tsdn, pac_t *pac,
ehooks_t *ehooks, edata_t *edata, size_t size_a, size_t size_b,
bool holding_core_locks);
bool extent_merge_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
edata_t *a, edata_t *b);
bool extent_commit_zero(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
bool commit, bool zero, bool growing_retained);
edata_t *extent_split_wrapper(tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks,
edata_t *edata, size_t size_a, size_t size_b, bool holding_core_locks);
bool extent_merge_wrapper(
tsdn_t *tsdn, pac_t *pac, ehooks_t *ehooks, edata_t *a, edata_t *b);
bool extent_commit_zero(tsdn_t *tsdn, ehooks_t *ehooks, edata_t *edata,
bool commit, bool zero, bool growing_retained);
size_t extent_sn_next(pac_t *pac);
bool extent_boot(void);
bool extent_boot(void);
JEMALLOC_ALWAYS_INLINE bool
extent_neighbor_head_state_mergeable(bool edata_is_head,
bool neighbor_is_head, bool forward) {
extent_neighbor_head_state_mergeable(
bool edata_is_head, bool neighbor_is_head, bool forward) {
/*
* Head states checking: disallow merging if the higher addr extent is a
* head extent. This helps preserve first-fit, and more importantly
@@ -90,8 +100,8 @@ extent_can_acquire_neighbor(edata_t *edata, rtree_contents_t contents,
}
/* It's not safe to access *neighbor yet; must verify states first. */
bool neighbor_is_head = contents.metadata.is_head;
if (!extent_neighbor_head_state_mergeable(edata_is_head_get(edata),
neighbor_is_head, forward)) {
if (!extent_neighbor_head_state_mergeable(
edata_is_head_get(edata), neighbor_is_head, forward)) {
return false;
}
extent_state_t neighbor_state = contents.metadata.state;
@@ -100,8 +110,9 @@ extent_can_acquire_neighbor(edata_t *edata, rtree_contents_t contents,
return false;
}
/* From this point, it's safe to access *neighbor. */
if (!expanding && (edata_committed_get(edata) !=
edata_committed_get(neighbor))) {
if (!expanding
&& (edata_committed_get(edata)
!= edata_committed_get(neighbor))) {
/*
* Some platforms (e.g. Windows) require an explicit
* commit step (and writing to uncommitted memory is not
@@ -121,11 +132,11 @@ extent_can_acquire_neighbor(edata_t *edata, rtree_contents_t contents,
return false;
}
if (opt_retain) {
assert(edata_arena_ind_get(edata) ==
edata_arena_ind_get(neighbor));
assert(edata_arena_ind_get(edata)
== edata_arena_ind_get(neighbor));
} else {
if (edata_arena_ind_get(edata) !=
edata_arena_ind_get(neighbor)) {
if (edata_arena_ind_get(edata)
!= edata_arena_ind_get(neighbor)) {
return false;
}
}
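The merge rule spelled out in the comments above (never merge when the higher-address extent is a head extent, which preserves first-fit ordering) reduces to a small two-way predicate. A hypothetical standalone sketch, with illustrative `toy_` naming rather than jemalloc's internal API:

```c
#include <stdbool.h>

/*
 * Merging "forward" means the neighbor sits at the higher address, so the
 * merge is allowed only if the neighbor is not a head extent; merging
 * backward puts edata itself at the higher address, so edata must not be
 * a head extent.
 */
static bool
toy_head_state_mergeable(bool edata_is_head, bool neighbor_is_head,
    bool forward) {
	return forward ? !neighbor_is_head : !edata_is_head;
}
```

The same check appears in the header as `extent_neighbor_head_state_mergeable`, invoked before it is safe to dereference the neighbor at all.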

@@ -1,26 +1,30 @@
#ifndef JEMALLOC_INTERNAL_EXTENT_DSS_H
#define JEMALLOC_INTERNAL_EXTENT_DSS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/tsd_types.h"
typedef enum {
dss_prec_disabled = 0,
dss_prec_primary = 1,
dss_prec_disabled = 0,
dss_prec_primary = 1,
dss_prec_secondary = 2,
dss_prec_limit = 3
dss_prec_limit = 3
} dss_prec_t;
#define DSS_PREC_DEFAULT dss_prec_secondary
#define DSS_DEFAULT "secondary"
extern const char *dss_prec_names[];
extern const char *const dss_prec_names[];
extern const char *opt_dss;
dss_prec_t extent_dss_prec_get(void);
bool extent_dss_prec_set(dss_prec_t dss_prec);
void *extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr,
size_t size, size_t alignment, bool *zero, bool *commit);
bool extent_in_dss(void *addr);
bool extent_dss_mergeable(void *addr_a, void *addr_b);
void extent_dss_boot(void);
bool extent_dss_prec_set(dss_prec_t dss_prec);
void *extent_alloc_dss(tsdn_t *tsdn, arena_t *arena, void *new_addr,
size_t size, size_t alignment, bool *zero, bool *commit);
bool extent_in_dss(void *addr);
bool extent_dss_mergeable(void *addr_a, void *addr_b);
void extent_dss_boot(void);
#endif /* JEMALLOC_INTERNAL_EXTENT_DSS_H */

@@ -1,10 +1,12 @@
#ifndef JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H
#define JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H
#include "jemalloc/internal/jemalloc_preamble.h"
extern bool opt_retain;
void *extent_alloc_mmap(void *new_addr, size_t size, size_t alignment,
bool *zero, bool *commit);
void *extent_alloc_mmap(
void *new_addr, size_t size, size_t alignment, bool *zero, bool *commit);
bool extent_dalloc_mmap(void *addr, size_t size);
#endif /* JEMALLOC_INTERNAL_EXTENT_MMAP_EXTERNS_H */

@@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_FB_H
#define JEMALLOC_INTERNAL_FB_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/bit_util.h"
/*
* The flat bitmap module. This has a larger API relative to the bitmap module
* (supporting things like backwards searches, and searching for both set and
@@ -11,8 +15,8 @@
typedef unsigned long fb_group_t;
#define FB_GROUP_BITS (ZU(1) << (LG_SIZEOF_LONG + 3))
#define FB_NGROUPS(nbits) ((nbits) / FB_GROUP_BITS \
+ ((nbits) % FB_GROUP_BITS == 0 ? 0 : 1))
#define FB_NGROUPS(nbits) \
((nbits) / FB_GROUP_BITS + ((nbits) % FB_GROUP_BITS == 0 ? 0 : 1))
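The FB_NGROUPS macro above is a ceiling division of the bit count by the group width. A standalone sketch of the same arithmetic, assuming 64-bit groups (i.e., LG_SIZEOF_LONG == 3 on an LP64 target); the `TOY_` names are illustrative:

```c
#include <stddef.h>

/* Assumed 64-bit groups, matching unsigned long on LP64 targets. */
#define TOY_GROUP_BITS ((size_t)64)

/* Ceiling division: one extra group when nbits is not an exact multiple. */
#define TOY_NGROUPS(nbits) \
	((nbits) / TOY_GROUP_BITS + ((nbits) % TOY_GROUP_BITS == 0 ? 0 : 1))
```

So 64 bits fit in one group, while 65 bits spill a single straggler bit into a second group.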
static inline void
fb_init(fb_group_t *fb, size_t nbits) {
@@ -71,7 +75,6 @@ fb_unset(fb_group_t *fb, size_t nbits, size_t bit) {
fb[group_ind] &= ~((fb_group_t)1 << bit_ind);
}
/*
* Some implementation details. This visitation function lets us apply a group
* visitor to each group in the bitmap (potentially modifying it). The mask
@@ -90,7 +93,8 @@ fb_visit_impl(fb_group_t *fb, size_t nbits, fb_group_visitor_t visit, void *ctx,
* to from bit 0.
*/
size_t first_group_cnt = (start_bit_ind + cnt > FB_GROUP_BITS
? FB_GROUP_BITS - start_bit_ind : cnt);
? FB_GROUP_BITS - start_bit_ind
: cnt);
/*
* We can basically split affected words into:
* - The first group, where we touch only the high bits
@@ -100,8 +104,8 @@ fb_visit_impl(fb_group_t *fb, size_t nbits, fb_group_visitor_t visit, void *ctx,
* this can lead to bad codegen for those middle words.
*/
/* First group */
fb_group_t mask = ((~(fb_group_t)0)
>> (FB_GROUP_BITS - first_group_cnt))
fb_group_t mask =
((~(fb_group_t)0) >> (FB_GROUP_BITS - first_group_cnt))
<< start_bit_ind;
visit(ctx, &fb[group_ind], mask);
@@ -172,12 +176,12 @@ fb_ucount(fb_group_t *fb, size_t nbits, size_t start, size_t cnt) {
* Returns the number of bits in the bitmap if no such bit exists.
*/
JEMALLOC_ALWAYS_INLINE ssize_t
fb_find_impl(fb_group_t *fb, size_t nbits, size_t start, bool val,
bool forward) {
fb_find_impl(
fb_group_t *fb, size_t nbits, size_t start, bool val, bool forward) {
assert(start < nbits);
size_t ngroups = FB_NGROUPS(nbits);
size_t ngroups = FB_NGROUPS(nbits);
ssize_t group_ind = start / FB_GROUP_BITS;
size_t bit_ind = start % FB_GROUP_BITS;
size_t bit_ind = start % FB_GROUP_BITS;
fb_group_t maybe_invert = (val ? 0 : (fb_group_t)-1);
@@ -261,8 +265,8 @@ fb_iter_range_impl(fb_group_t *fb, size_t nbits, size_t start, size_t *r_begin,
return false;
}
/* Half open range; the set bits are [begin, end). */
ssize_t next_range_end = fb_find_impl(fb, nbits, next_range_begin, !val,
forward);
ssize_t next_range_end = fb_find_impl(
fb, nbits, next_range_begin, !val, forward);
if (forward) {
*r_begin = next_range_begin;
*r_len = next_range_end - next_range_begin;
@@ -320,8 +324,9 @@ fb_range_longest_impl(fb_group_t *fb, size_t nbits, bool val) {
size_t begin = 0;
size_t longest_len = 0;
size_t len = 0;
while (begin < nbits && fb_iter_range_impl(fb, nbits, begin, &begin,
&len, val, /* forward */ true)) {
while (begin < nbits
&& fb_iter_range_impl(
fb, nbits, begin, &begin, &len, val, /* forward */ true)) {
if (len > longest_len) {
longest_len = len;
}

@@ -1,6 +1,9 @@
#ifndef JEMALLOC_INTERNAL_FXP_H
#define JEMALLOC_INTERNAL_FXP_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
/*
* A simple fixed-point math implementation, supporting only unsigned values
* (with overflow being an error).
@@ -86,7 +89,7 @@ fxp_round_down(fxp_t a) {
static inline uint32_t
fxp_round_nearest(fxp_t a) {
uint32_t fractional_part = (a & ((1U << 16) - 1));
uint32_t fractional_part = (a & ((1U << 16) - 1));
uint32_t increment = (uint32_t)(fractional_part >= (1U << 15));
return (a >> 16) + increment;
}
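The rounding rule above (increment the integer part exactly when the low 16 fractional bits are at least one half, i.e. 0x8000) can be exercised with a minimal standalone model of the same 16.16 unsigned representation; the `toy_` helpers are illustrative, not part of the header:

```c
#include <stdint.h>

/* Toy 16.16 unsigned fixed point, mirroring fxp_t above. */
typedef uint32_t toy_fxp_t;

/* Build a value from an integer part and a 16-bit fractional part. */
static inline toy_fxp_t
toy_fxp(uint32_t ipart, uint32_t frac16) {
	return (ipart << 16) | (frac16 & 0xffffU);
}

/* Round to nearest: increment iff the fraction is >= 1/2 (0x8000). */
static inline uint32_t
toy_fxp_round_nearest(toy_fxp_t a) {
	uint32_t fractional_part = a & ((1U << 16) - 1);
	uint32_t increment = (uint32_t)(fractional_part >= (1U << 15));
	return (a >> 16) + increment;
}
```

For example, 1.5 (0x18000) rounds up to 2, while 1.25 (0x14000) rounds down to 1.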

@@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_HASH_H
#define JEMALLOC_INTERNAL_HASH_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
/*
@@ -24,7 +25,7 @@ hash_rotl_64(uint64_t x, int8_t r) {
static inline uint32_t
hash_get_block_32(const uint32_t *p, int i) {
/* Handle unaligned read. */
if (unlikely((uintptr_t)p & (sizeof(uint32_t)-1)) != 0) {
if (unlikely((uintptr_t)p & (sizeof(uint32_t) - 1)) != 0) {
uint32_t ret;
memcpy(&ret, (uint8_t *)(p + i), sizeof(uint32_t));
@@ -37,7 +38,7 @@ hash_get_block_32(const uint32_t *p, int i) {
static inline uint64_t
hash_get_block_64(const uint64_t *p, int i) {
/* Handle unaligned read. */
if (unlikely((uintptr_t)p & (sizeof(uint64_t)-1)) != 0) {
if (unlikely((uintptr_t)p & (sizeof(uint64_t) - 1)) != 0) {
uint64_t ret;
memcpy(&ret, (uint8_t *)(p + i), sizeof(uint64_t));
@@ -71,8 +72,8 @@ hash_fmix_64(uint64_t k) {
static inline uint32_t
hash_x86_32(const void *key, int len, uint32_t seed) {
const uint8_t *data = (const uint8_t *) key;
const int nblocks = len / 4;
const uint8_t *data = (const uint8_t *)key;
const int nblocks = len / 4;
uint32_t h1 = seed;
@@ -81,8 +82,8 @@ hash_x86_32(const void *key, int len, uint32_t seed) {
/* body */
{
const uint32_t *blocks = (const uint32_t *) (data + nblocks*4);
int i;
const uint32_t *blocks = (const uint32_t *)(data + nblocks * 4);
int i;
for (i = -nblocks; i; i++) {
uint32_t k1 = hash_get_block_32(blocks, i);
@@ -93,21 +94,29 @@ hash_x86_32(const void *key, int len, uint32_t seed) {
h1 ^= k1;
h1 = hash_rotl_32(h1, 13);
h1 = h1*5 + 0xe6546b64;
h1 = h1 * 5 + 0xe6546b64;
}
}
/* tail */
{
const uint8_t *tail = (const uint8_t *) (data + nblocks*4);
const uint8_t *tail = (const uint8_t *)(data + nblocks * 4);
uint32_t k1 = 0;
switch (len & 3) {
case 3: k1 ^= tail[2] << 16; JEMALLOC_FALLTHROUGH;
case 2: k1 ^= tail[1] << 8; JEMALLOC_FALLTHROUGH;
case 1: k1 ^= tail[0]; k1 *= c1; k1 = hash_rotl_32(k1, 15);
k1 *= c2; h1 ^= k1;
case 3:
k1 ^= tail[2] << 16;
JEMALLOC_FALLTHROUGH;
case 2:
k1 ^= tail[1] << 8;
JEMALLOC_FALLTHROUGH;
case 1:
k1 ^= tail[0];
k1 *= c1;
k1 = hash_rotl_32(k1, 15);
k1 *= c2;
h1 ^= k1;
}
}
@@ -120,10 +129,9 @@ hash_x86_32(const void *key, int len, uint32_t seed) {
}
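The block/tail/finalization structure reformatted above is MurmurHash3's x86 32-bit variant. A compact standalone version (assuming little-endian byte order, and using memcpy for unaligned reads as hash_get_block_32 does) reproduces the published test vectors; the `toy_` names are illustrative:

```c
#include <stdint.h>
#include <string.h>

static uint32_t
toy_rotl32(uint32_t x, int r) {
	return (x << r) | (x >> (32 - r));
}

/* MurmurHash3 x86_32; memcpy handles potentially unaligned block reads. */
static uint32_t
toy_murmur3_32(const void *key, int len, uint32_t seed) {
	const uint8_t *data = (const uint8_t *)key;
	const int nblocks = len / 4;
	uint32_t h1 = seed;
	const uint32_t c1 = 0xcc9e2d51, c2 = 0x1b873593;
	for (int i = 0; i < nblocks; i++) {
		uint32_t k1;
		memcpy(&k1, data + i * 4, 4); /* little-endian assumed */
		k1 *= c1; k1 = toy_rotl32(k1, 15); k1 *= c2;
		h1 ^= k1; h1 = toy_rotl32(h1, 13); h1 = h1 * 5 + 0xe6546b64;
	}
	const uint8_t *tail = data + nblocks * 4;
	uint32_t k1 = 0;
	switch (len & 3) { /* deliberate fallthrough, as in the header */
	case 3: k1 ^= (uint32_t)tail[2] << 16; /* FALLTHROUGH */
	case 2: k1 ^= (uint32_t)tail[1] << 8;  /* FALLTHROUGH */
	case 1: k1 ^= tail[0];
		k1 *= c1; k1 = toy_rotl32(k1, 15); k1 *= c2; h1 ^= k1;
	}
	/* Finalization mix forces avalanche of the remaining bits. */
	h1 ^= (uint32_t)len;
	h1 ^= h1 >> 16; h1 *= 0x85ebca6b;
	h1 ^= h1 >> 13; h1 *= 0xc2b2ae35;
	h1 ^= h1 >> 16;
	return h1;
}
```

The cascading `switch` on `len & 3` is exactly the tail handling that clang-format expanded above, with one statement per line and explicit fallthrough annotations.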
static inline void
hash_x86_128(const void *key, const int len, uint32_t seed,
uint64_t r_out[2]) {
const uint8_t * data = (const uint8_t *) key;
const int nblocks = len / 16;
hash_x86_128(const void *key, const int len, uint32_t seed, uint64_t r_out[2]) {
const uint8_t *data = (const uint8_t *)key;
const int nblocks = len / 16;
uint32_t h1 = seed;
uint32_t h2 = seed;
@@ -137,95 +145,161 @@ hash_x86_128(const void *key, const int len, uint32_t seed,
/* body */
{
const uint32_t *blocks = (const uint32_t *) (data + nblocks*16);
int i;
const uint32_t *blocks = (const uint32_t *)(data
+ nblocks * 16);
int i;
for (i = -nblocks; i; i++) {
uint32_t k1 = hash_get_block_32(blocks, i*4 + 0);
uint32_t k2 = hash_get_block_32(blocks, i*4 + 1);
uint32_t k3 = hash_get_block_32(blocks, i*4 + 2);
uint32_t k4 = hash_get_block_32(blocks, i*4 + 3);
uint32_t k1 = hash_get_block_32(blocks, i * 4 + 0);
uint32_t k2 = hash_get_block_32(blocks, i * 4 + 1);
uint32_t k3 = hash_get_block_32(blocks, i * 4 + 2);
uint32_t k4 = hash_get_block_32(blocks, i * 4 + 3);
k1 *= c1; k1 = hash_rotl_32(k1, 15); k1 *= c2; h1 ^= k1;
k1 *= c1;
k1 = hash_rotl_32(k1, 15);
k1 *= c2;
h1 ^= k1;
h1 = hash_rotl_32(h1, 19); h1 += h2;
h1 = h1*5 + 0x561ccd1b;
h1 = hash_rotl_32(h1, 19);
h1 += h2;
h1 = h1 * 5 + 0x561ccd1b;
k2 *= c2; k2 = hash_rotl_32(k2, 16); k2 *= c3; h2 ^= k2;
k2 *= c2;
k2 = hash_rotl_32(k2, 16);
k2 *= c3;
h2 ^= k2;
h2 = hash_rotl_32(h2, 17); h2 += h3;
h2 = h2*5 + 0x0bcaa747;
h2 = hash_rotl_32(h2, 17);
h2 += h3;
h2 = h2 * 5 + 0x0bcaa747;
k3 *= c3; k3 = hash_rotl_32(k3, 17); k3 *= c4; h3 ^= k3;
k3 *= c3;
k3 = hash_rotl_32(k3, 17);
k3 *= c4;
h3 ^= k3;
h3 = hash_rotl_32(h3, 15); h3 += h4;
h3 = h3*5 + 0x96cd1c35;
h3 = hash_rotl_32(h3, 15);
h3 += h4;
h3 = h3 * 5 + 0x96cd1c35;
k4 *= c4; k4 = hash_rotl_32(k4, 18); k4 *= c1; h4 ^= k4;
k4 *= c4;
k4 = hash_rotl_32(k4, 18);
k4 *= c1;
h4 ^= k4;
h4 = hash_rotl_32(h4, 13); h4 += h1;
h4 = h4*5 + 0x32ac3b17;
h4 = hash_rotl_32(h4, 13);
h4 += h1;
h4 = h4 * 5 + 0x32ac3b17;
}
}
/* tail */
{
const uint8_t *tail = (const uint8_t *) (data + nblocks*16);
uint32_t k1 = 0;
uint32_t k2 = 0;
uint32_t k3 = 0;
uint32_t k4 = 0;
const uint8_t *tail = (const uint8_t *)(data + nblocks * 16);
uint32_t k1 = 0;
uint32_t k2 = 0;
uint32_t k3 = 0;
uint32_t k4 = 0;
switch (len & 15) {
case 15: k4 ^= tail[14] << 16; JEMALLOC_FALLTHROUGH;
case 14: k4 ^= tail[13] << 8; JEMALLOC_FALLTHROUGH;
case 13: k4 ^= tail[12] << 0;
k4 *= c4; k4 = hash_rotl_32(k4, 18); k4 *= c1; h4 ^= k4;
case 15:
k4 ^= tail[14] << 16;
JEMALLOC_FALLTHROUGH;
case 12: k3 ^= (uint32_t) tail[11] << 24; JEMALLOC_FALLTHROUGH;
case 11: k3 ^= tail[10] << 16; JEMALLOC_FALLTHROUGH;
case 10: k3 ^= tail[ 9] << 8; JEMALLOC_FALLTHROUGH;
case 9: k3 ^= tail[ 8] << 0;
k3 *= c3; k3 = hash_rotl_32(k3, 17); k3 *= c4; h3 ^= k3;
case 14:
k4 ^= tail[13] << 8;
JEMALLOC_FALLTHROUGH;
case 8: k2 ^= (uint32_t) tail[ 7] << 24; JEMALLOC_FALLTHROUGH;
case 7: k2 ^= tail[ 6] << 16; JEMALLOC_FALLTHROUGH;
case 6: k2 ^= tail[ 5] << 8; JEMALLOC_FALLTHROUGH;
case 5: k2 ^= tail[ 4] << 0;
k2 *= c2; k2 = hash_rotl_32(k2, 16); k2 *= c3; h2 ^= k2;
case 13:
k4 ^= tail[12] << 0;
k4 *= c4;
k4 = hash_rotl_32(k4, 18);
k4 *= c1;
h4 ^= k4;
JEMALLOC_FALLTHROUGH;
case 4: k1 ^= (uint32_t) tail[ 3] << 24; JEMALLOC_FALLTHROUGH;
case 3: k1 ^= tail[ 2] << 16; JEMALLOC_FALLTHROUGH;
case 2: k1 ^= tail[ 1] << 8; JEMALLOC_FALLTHROUGH;
case 1: k1 ^= tail[ 0] << 0;
k1 *= c1; k1 = hash_rotl_32(k1, 15); k1 *= c2; h1 ^= k1;
case 12:
k3 ^= (uint32_t)tail[11] << 24;
JEMALLOC_FALLTHROUGH;
case 11:
k3 ^= tail[10] << 16;
JEMALLOC_FALLTHROUGH;
case 10:
k3 ^= tail[9] << 8;
JEMALLOC_FALLTHROUGH;
case 9:
k3 ^= tail[8] << 0;
k3 *= c3;
k3 = hash_rotl_32(k3, 17);
k3 *= c4;
h3 ^= k3;
JEMALLOC_FALLTHROUGH;
case 8:
k2 ^= (uint32_t)tail[7] << 24;
JEMALLOC_FALLTHROUGH;
case 7:
k2 ^= tail[6] << 16;
JEMALLOC_FALLTHROUGH;
case 6:
k2 ^= tail[5] << 8;
JEMALLOC_FALLTHROUGH;
case 5:
k2 ^= tail[4] << 0;
k2 *= c2;
k2 = hash_rotl_32(k2, 16);
k2 *= c3;
h2 ^= k2;
JEMALLOC_FALLTHROUGH;
case 4:
k1 ^= (uint32_t)tail[3] << 24;
JEMALLOC_FALLTHROUGH;
case 3:
k1 ^= tail[2] << 16;
JEMALLOC_FALLTHROUGH;
case 2:
k1 ^= tail[1] << 8;
JEMALLOC_FALLTHROUGH;
case 1:
k1 ^= tail[0] << 0;
k1 *= c1;
k1 = hash_rotl_32(k1, 15);
k1 *= c2;
h1 ^= k1;
break;
}
}
/* finalization */
h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
h1 ^= len;
h2 ^= len;
h3 ^= len;
h4 ^= len;
h1 += h2; h1 += h3; h1 += h4;
h2 += h1; h3 += h1; h4 += h1;
h1 += h2;
h1 += h3;
h1 += h4;
h2 += h1;
h3 += h1;
h4 += h1;
h1 = hash_fmix_32(h1);
h2 = hash_fmix_32(h2);
h3 = hash_fmix_32(h3);
h4 = hash_fmix_32(h4);
h1 += h2; h1 += h3; h1 += h4;
h2 += h1; h3 += h1; h4 += h1;
h1 += h2;
h1 += h3;
h1 += h4;
h2 += h1;
h3 += h1;
h4 += h1;
r_out[0] = (((uint64_t) h2) << 32) | h1;
r_out[1] = (((uint64_t) h4) << 32) | h3;
r_out[0] = (((uint64_t)h2) << 32) | h1;
r_out[1] = (((uint64_t)h4) << 32) | h3;
}
static inline void
hash_x64_128(const void *key, const int len, const uint32_t seed,
uint64_t r_out[2]) {
const uint8_t *data = (const uint8_t *) key;
const int nblocks = len / 16;
hash_x64_128(
const void *key, const int len, const uint32_t seed, uint64_t r_out[2]) {
const uint8_t *data = (const uint8_t *)key;
const int nblocks = len / 16;
uint64_t h1 = seed;
uint64_t h2 = seed;
@@ -235,56 +309,99 @@ hash_x64_128(const void *key, const int len, const uint32_t seed,
/* body */
{
const uint64_t *blocks = (const uint64_t *) (data);
int i;
const uint64_t *blocks = (const uint64_t *)(data);
int i;
for (i = 0; i < nblocks; i++) {
uint64_t k1 = hash_get_block_64(blocks, i*2 + 0);
uint64_t k2 = hash_get_block_64(blocks, i*2 + 1);
uint64_t k1 = hash_get_block_64(blocks, i * 2 + 0);
uint64_t k2 = hash_get_block_64(blocks, i * 2 + 1);
k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1;
k1 *= c1;
k1 = hash_rotl_64(k1, 31);
k1 *= c2;
h1 ^= k1;
h1 = hash_rotl_64(h1, 27); h1 += h2;
h1 = h1*5 + 0x52dce729;
h1 = hash_rotl_64(h1, 27);
h1 += h2;
h1 = h1 * 5 + 0x52dce729;
k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2;
k2 *= c2;
k2 = hash_rotl_64(k2, 33);
k2 *= c1;
h2 ^= k2;
h2 = hash_rotl_64(h2, 31); h2 += h1;
h2 = h2*5 + 0x38495ab5;
h2 = hash_rotl_64(h2, 31);
h2 += h1;
h2 = h2 * 5 + 0x38495ab5;
}
}
/* tail */
{
const uint8_t *tail = (const uint8_t*)(data + nblocks*16);
uint64_t k1 = 0;
uint64_t k2 = 0;
const uint8_t *tail = (const uint8_t *)(data + nblocks * 16);
uint64_t k1 = 0;
uint64_t k2 = 0;
switch (len & 15) {
case 15: k2 ^= ((uint64_t)(tail[14])) << 48; JEMALLOC_FALLTHROUGH;
case 14: k2 ^= ((uint64_t)(tail[13])) << 40; JEMALLOC_FALLTHROUGH;
case 13: k2 ^= ((uint64_t)(tail[12])) << 32; JEMALLOC_FALLTHROUGH;
case 12: k2 ^= ((uint64_t)(tail[11])) << 24; JEMALLOC_FALLTHROUGH;
case 11: k2 ^= ((uint64_t)(tail[10])) << 16; JEMALLOC_FALLTHROUGH;
case 10: k2 ^= ((uint64_t)(tail[ 9])) << 8; JEMALLOC_FALLTHROUGH;
case 9: k2 ^= ((uint64_t)(tail[ 8])) << 0;
k2 *= c2; k2 = hash_rotl_64(k2, 33); k2 *= c1; h2 ^= k2;
case 15:
k2 ^= ((uint64_t)(tail[14])) << 48;
JEMALLOC_FALLTHROUGH;
case 8: k1 ^= ((uint64_t)(tail[ 7])) << 56; JEMALLOC_FALLTHROUGH;
case 7: k1 ^= ((uint64_t)(tail[ 6])) << 48; JEMALLOC_FALLTHROUGH;
case 6: k1 ^= ((uint64_t)(tail[ 5])) << 40; JEMALLOC_FALLTHROUGH;
case 5: k1 ^= ((uint64_t)(tail[ 4])) << 32; JEMALLOC_FALLTHROUGH;
case 4: k1 ^= ((uint64_t)(tail[ 3])) << 24; JEMALLOC_FALLTHROUGH;
case 3: k1 ^= ((uint64_t)(tail[ 2])) << 16; JEMALLOC_FALLTHROUGH;
case 2: k1 ^= ((uint64_t)(tail[ 1])) << 8; JEMALLOC_FALLTHROUGH;
case 1: k1 ^= ((uint64_t)(tail[ 0])) << 0;
k1 *= c1; k1 = hash_rotl_64(k1, 31); k1 *= c2; h1 ^= k1;
case 14:
k2 ^= ((uint64_t)(tail[13])) << 40;
JEMALLOC_FALLTHROUGH;
case 13:
k2 ^= ((uint64_t)(tail[12])) << 32;
JEMALLOC_FALLTHROUGH;
case 12:
k2 ^= ((uint64_t)(tail[11])) << 24;
JEMALLOC_FALLTHROUGH;
case 11:
k2 ^= ((uint64_t)(tail[10])) << 16;
JEMALLOC_FALLTHROUGH;
case 10:
k2 ^= ((uint64_t)(tail[9])) << 8;
JEMALLOC_FALLTHROUGH;
case 9:
k2 ^= ((uint64_t)(tail[8])) << 0;
k2 *= c2;
k2 = hash_rotl_64(k2, 33);
k2 *= c1;
h2 ^= k2;
JEMALLOC_FALLTHROUGH;
case 8:
k1 ^= ((uint64_t)(tail[7])) << 56;
JEMALLOC_FALLTHROUGH;
case 7:
k1 ^= ((uint64_t)(tail[6])) << 48;
JEMALLOC_FALLTHROUGH;
case 6:
k1 ^= ((uint64_t)(tail[5])) << 40;
JEMALLOC_FALLTHROUGH;
case 5:
k1 ^= ((uint64_t)(tail[4])) << 32;
JEMALLOC_FALLTHROUGH;
case 4:
k1 ^= ((uint64_t)(tail[3])) << 24;
JEMALLOC_FALLTHROUGH;
case 3:
k1 ^= ((uint64_t)(tail[2])) << 16;
JEMALLOC_FALLTHROUGH;
case 2:
k1 ^= ((uint64_t)(tail[1])) << 8;
JEMALLOC_FALLTHROUGH;
case 1:
k1 ^= ((uint64_t)(tail[0])) << 0;
k1 *= c1;
k1 = hash_rotl_64(k1, 31);
k1 *= c2;
h1 ^= k1;
break;
}
}
/* finalization */
h1 ^= len; h2 ^= len;
h1 ^= len;
h2 ^= len;
h1 += h2;
h2 += h1;

@@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_HOOK_H
#define JEMALLOC_INTERNAL_HOOK_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/tsd.h"
/*
@@ -55,6 +56,7 @@ enum hook_alloc_e {
hook_alloc_calloc,
hook_alloc_memalign,
hook_alloc_valloc,
hook_alloc_pvalloc,
hook_alloc_mallocx,
/* The reallocating functions have both alloc and dalloc variants */
@@ -81,7 +83,6 @@ enum hook_dalloc_e {
};
typedef enum hook_dalloc_e hook_dalloc_t;
enum hook_expand_e {
hook_expand_realloc,
hook_expand_rallocx,
@@ -89,23 +90,22 @@ enum hook_expand_e {
};
typedef enum hook_expand_e hook_expand_t;
typedef void (*hook_alloc)(
void *extra, hook_alloc_t type, void *result, uintptr_t result_raw,
uintptr_t args_raw[3]);
typedef void (*hook_alloc)(void *extra, hook_alloc_t type, void *result,
uintptr_t result_raw, uintptr_t args_raw[3]);
typedef void (*hook_dalloc)(
void *extra, hook_dalloc_t type, void *address, uintptr_t args_raw[3]);
typedef void (*hook_expand)(
void *extra, hook_expand_t type, void *address, size_t old_usize,
size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]);
typedef void (*hook_expand)(void *extra, hook_expand_t type, void *address,
size_t old_usize, size_t new_usize, uintptr_t result_raw,
uintptr_t args_raw[4]);
typedef struct hooks_s hooks_t;
struct hooks_s {
hook_alloc alloc_hook;
hook_alloc alloc_hook;
hook_dalloc dalloc_hook;
hook_expand expand_hook;
void *extra;
void *extra;
};
/*
@@ -143,9 +143,9 @@ struct hook_ralloc_args_s {
* Returns an opaque handle to be used when removing the hook. NULL means that
* we couldn't install the hook.
*/
bool hook_boot();
bool hook_boot(void);
void *hook_install(tsdn_t *tsdn, hooks_t *hooks);
void *hook_install(tsdn_t *tsdn, hooks_t *to_install);
/* Uninstalls the hook with the handle previously returned from hook_install. */
void hook_remove(tsdn_t *tsdn, void *opaque);
@@ -154,8 +154,8 @@ void hook_remove(tsdn_t *tsdn, void *opaque);
void hook_invoke_alloc(hook_alloc_t type, void *result, uintptr_t result_raw,
uintptr_t args_raw[3]);
void hook_invoke_dalloc(hook_dalloc_t type, void *address,
uintptr_t args_raw[3]);
void hook_invoke_dalloc(
hook_dalloc_t type, void *address, uintptr_t args_raw[3]);
void hook_invoke_expand(hook_expand_t type, void *address, size_t old_usize,
size_t new_usize, uintptr_t result_raw, uintptr_t args_raw[4]);

@@ -1,42 +1,18 @@
#ifndef JEMALLOC_INTERNAL_HPA_H
#define JEMALLOC_INTERNAL_HPA_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/base.h"
#include "jemalloc/internal/edata_cache.h"
#include "jemalloc/internal/emap.h"
#include "jemalloc/internal/exp_grow.h"
#include "jemalloc/internal/hpa_central.h"
#include "jemalloc/internal/hpa_hooks.h"
#include "jemalloc/internal/hpa_opts.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/pai.h"
#include "jemalloc/internal/psset.h"
typedef struct hpa_central_s hpa_central_t;
struct hpa_central_s {
/*
* The mutex guarding most of the operations on the central data
* structure.
*/
malloc_mutex_t mtx;
/*
* Guards expansion of eden. We separate this from the regular mutex so
* that cheaper operations can still continue while we're doing the OS
* call.
*/
malloc_mutex_t grow_mtx;
/*
* Either NULL (if empty), or some integer multiple of a
* hugepage-aligned number of hugepages. We carve them off one at a
* time to satisfy new pageslab requests.
*
* Guarded by grow_mtx.
*/
void *eden;
size_t eden_len;
/* Source for metadata. */
base_t *base;
/* Number of grow operations done on this hpa_central_t. */
uint64_t age_counter;
/* The HPA hooks. */
hpa_hooks_t hooks;
};
#include "jemalloc/internal/sec.h"
typedef struct hpa_shard_nonderived_stats_s hpa_shard_nonderived_stats_t;
struct hpa_shard_nonderived_stats_s {
@@ -61,6 +37,14 @@ struct hpa_shard_nonderived_stats_s {
* Guarded by mtx.
*/
uint64_t nhugifies;
/*
* The number of times we've tried to hugify a pageslab, but failed.
*
* Guarded by mtx.
*/
uint64_t nhugify_failures;
/*
* The number of times we've dehugified a pageslab.
*
@@ -72,8 +56,9 @@ struct hpa_shard_nonderived_stats_s {
/* Completely derived; only used by CTL. */
typedef struct hpa_shard_stats_s hpa_shard_stats_t;
struct hpa_shard_stats_s {
psset_stats_t psset_stats;
psset_stats_t psset_stats;
hpa_shard_nonderived_stats_t nonderived_stats;
sec_stats_t secstats;
};
typedef struct hpa_shard_s hpa_shard_t;
@@ -86,14 +71,17 @@ struct hpa_shard_s {
/* The central allocator we get our hugepages from. */
hpa_central_t *central;
/* Protects most of this shard's state. */
malloc_mutex_t mtx;
/*
* Guards the shard's access to the central allocator (preventing
* multiple threads operating on this shard from accessing the central
* allocator).
*/
malloc_mutex_t grow_mtx;
/* The base metadata allocator. */
base_t *base;
@@ -104,6 +92,9 @@ struct hpa_shard_s {
*/
edata_cache_fast_t ecf;
/* Small extent cache (not guarded by mtx) */
JEMALLOC_ALIGNED(CACHELINE) sec_t sec;
psset_t psset;
/*
@@ -141,22 +132,31 @@ struct hpa_shard_s {
* Last time we performed purge on this shard.
*/
nstime_t last_purge;
/*
* Last time we attempted work (purging or hugifying). If deferral of
* the work is allowed (i.e., a background thread is running), this is
* the time when the background thread checked whether purging or
* hugifying needs to be done. If deferral is not allowed, this is the
* time of the last (hpa_alloc or hpa_dalloc) activity in the shard.
*/
nstime_t last_time_work_attempted;
};
bool hpa_hugepage_size_exceeds_limit(void);
/*
* Whether or not the HPA can be used given the current configuration. This
* is not necessarily a guarantee that it backs its allocations by hugepages,
* just that it can function properly given the system it's running on.
*/
bool hpa_supported();
bool hpa_central_init(hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks);
bool hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
base_t *base, edata_cache_t *edata_cache, unsigned ind,
const hpa_shard_opts_t *opts);
bool hpa_supported(void);
bool hpa_shard_init(tsdn_t *tsdn, hpa_shard_t *shard, hpa_central_t *central,
emap_t *emap, base_t *base, edata_cache_t *edata_cache, unsigned ind,
const hpa_shard_opts_t *opts, const sec_opts_t *sec_opts);
void hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src);
void hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard,
hpa_shard_stats_t *dst);
void hpa_shard_stats_merge(
tsdn_t *tsdn, hpa_shard_t *shard, hpa_shard_stats_t *dst);
/*
* Notify the shard that we won't use it for allocations much longer. Due to
@ -165,15 +165,18 @@ void hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard,
*/
void hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard);
/* Flush caches that the shard may be using. */
void hpa_shard_flush(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_set_deferral_allowed(
tsdn_t *tsdn, hpa_shard_t *shard, bool deferral_allowed);
void hpa_shard_do_deferred_work(tsdn_t *tsdn, hpa_shard_t *shard);
/*
* We share the fork ordering with the PA and arena prefork handling; that's why
* these are 2, 3 and 4 rather than 0 and 1.
*/
void hpa_shard_prefork2(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard);
void hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard);


@ -0,0 +1,41 @@
#ifndef JEMALLOC_INTERNAL_HPA_CENTRAL_H
#define JEMALLOC_INTERNAL_HPA_CENTRAL_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/base.h"
#include "jemalloc/internal/hpa_hooks.h"
#include "jemalloc/internal/hpdata.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/tsd_types.h"
typedef struct hpa_central_s hpa_central_t;
struct hpa_central_s {
/*
* Guards expansion of eden. We separate this from the regular mutex so
* that cheaper operations can still continue while we're doing the OS
* call.
*/
malloc_mutex_t grow_mtx;
/*
* Either NULL (if empty), or a hugepage-aligned region spanning some
* integer number of hugepages. We carve them off one at a time to
* satisfy new pageslab requests.
*
* Guarded by grow_mtx.
*/
void *eden;
size_t eden_len;
/* Source for metadata. */
base_t *base;
/* The HPA hooks. */
hpa_hooks_t hooks;
};
bool hpa_central_init(
hpa_central_t *central, base_t *base, const hpa_hooks_t *hooks);
hpdata_t *hpa_central_extract(tsdn_t *tsdn, hpa_central_t *central, size_t size,
uint64_t age, bool hugify_eager, bool *oom);
#endif /* JEMALLOC_INTERNAL_HPA_CENTRAL_H */


@ -1,17 +1,21 @@
#ifndef JEMALLOC_INTERNAL_HPA_HOOKS_H
#define JEMALLOC_INTERNAL_HPA_HOOKS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/nstime.h"
typedef struct hpa_hooks_s hpa_hooks_t;
struct hpa_hooks_s {
void *(*map)(size_t size);
void (*unmap)(void *ptr, size_t size);
void (*purge)(void *ptr, size_t size);
bool (*hugify)(void *ptr, size_t size, bool sync);
void (*dehugify)(void *ptr, size_t size);
void (*curtime)(nstime_t *r_time, bool first_reading);
uint64_t (*ms_since)(nstime_t *r_time);
bool (*vectorized_purge)(void *vec, size_t vlen, size_t nbytes);
};
extern const hpa_hooks_t hpa_hooks_default;
#endif /* JEMALLOC_INTERNAL_HPA_HOOKS_H */


@ -1,13 +1,66 @@
#ifndef JEMALLOC_INTERNAL_HPA_OPTS_H
#define JEMALLOC_INTERNAL_HPA_OPTS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/fxp.h"
/*
* This file is morally part of hpa.h, but is split out for header-ordering
* reasons.
*
* All of these hpa_shard_opts below are experimental. We are exploring more
* efficient packing, hugifying, and purging approaches to make efficient
* trade-offs between CPU, memory, latency, and usability. This means all of
* them are at the risk of being deprecated and corresponding configurations
* should be updated once the final version settles.
*/
/*
* This enum controls how jemalloc hugifies/dehugifies pages. Each style may be
* more suitable depending on deployment environments.
*
* hpa_hugify_style_none
* Using this means that jemalloc will not be hugifying or dehugifying pages,
* but will let the kernel make those decisions. This style only makes sense
* when deploying on systems where THP is enabled in 'always' mode. With this
* style, you most likely want to have no purging at all (dirty_mult=-1) or
* purge_threshold=HUGEPAGE bytes (2097152 for a 2 MiB hugepage), although other
* thresholds may work well depending on the kernel settings of your deployment
* targets.
*
* hpa_hugify_style_eager
* This style results in jemalloc giving hugepage advice, if needed, to
* anonymous memory immediately after it is mapped, so huge pages can be backing
* that memory at page-fault time. This is usually more efficient than doing
* it later, and it allows us to benefit from the hugepages from the start.
* The same purging options as for the 'none' style are good starting choices:
* no purging, or purge_threshold=HUGEPAGE, some min_purge_delay_ms that allows
* pages not to be purged too quickly, etc. This is a good choice if you can
* afford extra memory and your application gains a performance increase from
* transparent hugepages.
*
* hpa_hugify_style_lazy
* This style is suitable when you purge more aggressively (you sacrifice CPU
* performance for less memory). When this style is chosen, jemalloc will
* hugify once hugification_threshold is reached, and dehugify before purging.
* If the kernel is configured to use direct compaction you may experience some
* allocation latency when using this style. It is best to measure what works
* better for your application, in the target deployment environment.
* This is a good choice for apps that cannot afford a lot of memory regression,
* but would still like to benefit from backing certain memory regions with
* hugepages.
*/
enum hpa_hugify_style_e {
hpa_hugify_style_auto = 0,
hpa_hugify_style_none = 1,
hpa_hugify_style_eager = 2,
hpa_hugify_style_lazy = 3,
hpa_hugify_style_limit = hpa_hugify_style_lazy + 1
};
typedef enum hpa_hugify_style_e hpa_hugify_style_t;
extern const char *const hpa_hugify_style_names[];
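The enum above pairs with a name table used when parsing configuration strings. As a standalone sketch (the real contents of hpa_hugify_style_names and the real parsing code are not shown in this diff, so the table below is an assumption), a style lookup might work like this:

```c
#include <assert.h>
#include <string.h>

/* Mirrors hpa_hugify_style_e from the header; values are illustrative. */
typedef enum {
	hugify_style_auto = 0,
	hugify_style_none = 1,
	hugify_style_eager = 2,
	hugify_style_lazy = 3,
	hugify_style_limit = 4
} hugify_style_t;

/* Hypothetical name table; jemalloc's actual table layout may differ. */
static const char *const hugify_style_names[] = {
	"auto", "none", "eager", "lazy"
};

/* Translate a config string into a style enum; nonzero on unknown name. */
static int
hugify_style_from_name(const char *name, hugify_style_t *out) {
	for (int i = 0; i < (int)hugify_style_limit; i++) {
		if (strcmp(name, hugify_style_names[i]) == 0) {
			*out = (hugify_style_t)i;
			return 0;
		}
	}
	return 1;
}
```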
typedef struct hpa_shard_opts_s hpa_shard_opts_t;
struct hpa_shard_opts_s {
/*
@ -44,12 +97,64 @@ struct hpa_shard_opts_s {
*/
uint64_t hugify_delay_ms;
/*
* Hugify pages synchronously (hugify will happen even if hugify_style
* is not hpa_hugify_style_lazy).
*/
bool hugify_sync;
/*
* Minimum amount of time between purges.
*/
uint64_t min_purge_interval_ms;
/*
* Maximum number of hugepages to purge on each purging attempt.
*/
ssize_t experimental_max_purge_nhp;
/*
* Minimum number of inactive bytes needed for a non-empty page to be
* considered purgable.
*
* When the number of touched inactive bytes on a non-empty hugepage is
* >= purge_threshold, the page is purgable. Empty pages are always
* purgable. Setting this to HUGEPAGE bytes would only purge empty
* pages if using hugify_style_eager and the purges would be exactly
* HUGEPAGE bytes. Depending on your kernel settings, this may result
* in better performance.
*
* Note that when the threshold is reached, we purge all the dirty
* bytes, not just up to the threshold. If this is PAGE bytes, then
* all the pages that have any dirty bytes are purgable. We treat the
* purge_threshold constraint as stronger than dirty_mult; in other
* words, if no page meets purge_threshold, we will not purge even if
* we are above dirty_mult.
*/
size_t purge_threshold;
/*
* Minimum number of ms that must elapse between a hugepage becoming
* eligible for purging and actually getting purged.
*
* Setting this to a larger number gives a better chance of reusing
* that memory. Setting it to 0 means that a page is eligible for
* purging as soon as it meets the purge_threshold. The clock resets
* when the purgability of the page changes (the page goes from
* non-purgable to purgable). When using the eager style you probably
* want to allow for some delay, to avoid purging a page too quickly
* and to give it time to be reused.
*/
uint64_t min_purge_delay_ms;
/*
* Style of hugification/dehugification (see comment at
* hpa_hugify_style_t for options).
*/
hpa_hugify_style_t hugify_style;
};
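To make the interaction of purge_threshold and min_purge_delay_ms concrete, here is a minimal standalone predicate (an illustrative sketch, not jemalloc's actual purging code) encoding the rules described in the comments above: empty pages always qualify, non-empty pages qualify once inactive bytes reach the threshold, and in either case the eligibility delay must have elapsed.

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Hypothetical helper; parameter names mirror the option comments above. */
static bool
page_purgable(size_t inactive_bytes, size_t nactive, size_t purge_threshold,
    uint64_t eligible_for_ms, uint64_t min_purge_delay_ms) {
	/* Empty pages are always purgable; otherwise apply the threshold. */
	bool meets_threshold = (nactive == 0)
	    || inactive_bytes >= purge_threshold;
	/* The page must also have been eligible for the minimum delay. */
	return meets_threshold && eligible_for_ms >= min_purge_delay_ms;
}
```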
/* clang-format off */
#define HPA_SHARD_OPTS_DEFAULT { \
/* slab_max_alloc */ \
64 * 1024, \
@ -67,8 +172,19 @@ struct hpa_shard_opts_s {
false, \
/* hugify_delay_ms */ \
10 * 1000, \
/* hugify_sync */ \
false, \
/* min_purge_interval_ms */ \
5 * 1000, \
/* experimental_max_purge_nhp */ \
-1, \
/* purge_threshold */ \
PAGE, \
/* min_purge_delay_ms */ \
0, \
/* hugify_style */ \
hpa_hugify_style_lazy \
}
/* clang-format on */
#endif /* JEMALLOC_INTERNAL_HPA_OPTS_H */


@ -0,0 +1,161 @@
#ifndef JEMALLOC_INTERNAL_HPA_UTILS_H
#define JEMALLOC_INTERNAL_HPA_UTILS_H
#include "jemalloc/internal/hpa.h"
#include "jemalloc/internal/extent.h"
#define HPA_MIN_VAR_VEC_SIZE 8
/*
* This is used for jemalloc internal tuning and may change in the future based
* on production traffic.
*
* This value protects two things:
* 1. Stack size
* 2. Number of huge pages that are being purged in a batch as we do not
* allow allocations while making madvise syscall.
*/
#define HPA_PURGE_BATCH_MAX 16
#ifdef JEMALLOC_HAVE_PROCESS_MADVISE
typedef struct iovec hpa_io_vector_t;
#else
typedef struct {
void *iov_base;
size_t iov_len;
} hpa_io_vector_t;
#endif
static inline size_t
hpa_process_madvise_max_iovec_len(void) {
assert(
opt_process_madvise_max_batch <= PROCESS_MADVISE_MAX_BATCH_LIMIT);
return opt_process_madvise_max_batch == 0
? HPA_MIN_VAR_VEC_SIZE
: opt_process_madvise_max_batch;
}
/* Actually invoke the hooks. If the vectorized purge fails, fall back to single purges. */
static void
hpa_try_vectorized_purge(
hpa_hooks_t *hooks, hpa_io_vector_t *vec, size_t vlen, size_t nbytes) {
bool success = opt_process_madvise_max_batch > 0
&& !hooks->vectorized_purge(vec, vlen, nbytes);
if (!success) {
/*
 * On failure, it is safe to purge again (potential perf
 * penalty). If the kernel could tell exactly which regions
 * failed, we could avoid that penalty.
 */
for (size_t i = 0; i < vlen; ++i) {
hooks->purge(vec[i].iov_base, vec[i].iov_len);
}
}
}
/*
* This structure accumulates the regions for process_madvise. It invokes the
* hook when the batch limit is reached.
*/
typedef struct {
hpa_io_vector_t *vp;
size_t cur;
size_t total_bytes;
size_t capacity;
} hpa_range_accum_t;
static inline void
hpa_range_accum_init(hpa_range_accum_t *ra, hpa_io_vector_t *v, size_t sz) {
ra->vp = v;
ra->capacity = sz;
ra->total_bytes = 0;
ra->cur = 0;
}
static inline void
hpa_range_accum_flush(hpa_range_accum_t *ra, hpa_hooks_t *hooks) {
assert(ra->total_bytes > 0 && ra->cur > 0);
hpa_try_vectorized_purge(hooks, ra->vp, ra->cur, ra->total_bytes);
ra->cur = 0;
ra->total_bytes = 0;
}
static inline void
hpa_range_accum_add(
hpa_range_accum_t *ra, void *addr, size_t sz, hpa_hooks_t *hooks) {
assert(ra->cur < ra->capacity);
ra->vp[ra->cur].iov_base = addr;
ra->vp[ra->cur].iov_len = sz;
ra->total_bytes += sz;
ra->cur++;
if (ra->cur == ra->capacity) {
hpa_range_accum_flush(ra, hooks);
}
}
static inline void
hpa_range_accum_finish(hpa_range_accum_t *ra, hpa_hooks_t *hooks) {
if (ra->cur > 0) {
hpa_range_accum_flush(ra, hooks);
}
}
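The accumulator above batches address ranges and flushes them through the purge hooks whenever its vector fills, with a final flush for any remainder. A self-contained re-creation of that pattern (with the hook stubbed out as a counter, since the real hpa hooks are not available outside jemalloc) looks like:

```c
#include <assert.h>
#include <stddef.h>

typedef struct {
	void *iov_base;
	size_t iov_len;
} io_vec_t;

typedef struct {
	io_vec_t *vp;
	size_t cur;
	size_t total_bytes;
	size_t capacity;
	size_t nflushes; /* How many times the (stubbed) hook ran. */
} range_accum_t;

static void
range_accum_flush(range_accum_t *ra) {
	assert(ra->cur > 0 && ra->total_bytes > 0);
	ra->nflushes++; /* A real implementation would purge ra->vp[0..cur). */
	ra->cur = 0;
	ra->total_bytes = 0;
}

static void
range_accum_add(range_accum_t *ra, void *addr, size_t sz) {
	assert(ra->cur < ra->capacity);
	ra->vp[ra->cur].iov_base = addr;
	ra->vp[ra->cur].iov_len = sz;
	ra->total_bytes += sz;
	if (++ra->cur == ra->capacity) {
		range_accum_flush(ra); /* Batch full: flush eagerly. */
	}
}

static void
range_accum_finish(range_accum_t *ra) {
	if (ra->cur > 0) {
		range_accum_flush(ra); /* Flush any leftover partial batch. */
	}
}
```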
/*
* For purging more than one page we use a batch of these items.
*/
typedef struct {
hpdata_purge_state_t state;
hpdata_t *hp;
bool dehugify;
} hpa_purge_item_t;
typedef struct hpa_purge_batch_s hpa_purge_batch_t;
struct hpa_purge_batch_s {
hpa_purge_item_t *items;
size_t items_capacity;
/* Number of huge pages to purge in current batch */
size_t item_cnt;
/* Number of ranges to purge in current batch */
size_t nranges;
/* Total number of dirty pages in current batch */
size_t ndirty_in_batch;
/* Max number of huge pages to purge */
size_t max_hp;
/*
* Once we are above this watermark we should not add more pages
* to the same batch. While we want to minimize the number of
* madvise calls, we also do not want to block allocations from
* too many huge pages (which we have to do while they are being
* purged).
*/
size_t range_watermark;
size_t npurged_hp_total;
};
static inline bool
hpa_batch_full(hpa_purge_batch_t *b) {
/* It's okay for the range count to go above the watermark. */
return b->npurged_hp_total == b->max_hp
|| b->item_cnt == b->items_capacity
|| b->nranges >= b->range_watermark;
}
static inline void
hpa_batch_pass_start(hpa_purge_batch_t *b) {
b->item_cnt = 0;
b->nranges = 0;
b->ndirty_in_batch = 0;
}
static inline bool
hpa_batch_empty(hpa_purge_batch_t *b) {
return b->item_cnt == 0;
}
/* Purge pages in a batch using given hooks */
void hpa_purge_batch(
hpa_hooks_t *hooks, hpa_purge_item_t *batch, size_t batch_sz);
#endif /* JEMALLOC_INTERNAL_HPA_UTILS_H */
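The batch-full rule in hpa_batch_full stops a batch on any of three conditions: the total hugepage budget is spent, the item array is full, or the range watermark is crossed. A standalone sketch of that rule (free parameters instead of the struct, purely for illustration):

```c
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

/* Hypothetical helper mirroring the three stop conditions above. */
static bool
batch_full(size_t npurged_hp_total, size_t max_hp, size_t item_cnt,
    size_t items_capacity, size_t nranges, size_t range_watermark) {
	return npurged_hp_total == max_hp /* Hugepage budget spent. */
	    || item_cnt == items_capacity /* Item array full. */
	    || nranges >= range_watermark; /* Too many ranges held. */
}
```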


@ -1,7 +1,10 @@
#ifndef JEMALLOC_INTERNAL_HPDATA_H
#define JEMALLOC_INTERNAL_HPDATA_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/fb.h"
#include "jemalloc/internal/nstime.h"
#include "jemalloc/internal/pages.h"
#include "jemalloc/internal/ph.h"
#include "jemalloc/internal/ql.h"
#include "jemalloc/internal/typed_list.h"
@ -17,8 +20,14 @@
* an observable property of any given region of address space). It's just
* hugepage-sized and hugepage-aligned; it's *potentially* huge.
*/
/*
* The maximum enumeration number must not exceed 2^16 - 1; see the comments
* on ESET_ENUMERATE_MAX_NUM in edata.h for details.
*/
#define PSSET_ENUMERATE_MAX_NUM 32
typedef struct hpdata_s hpdata_t;
ph_structs(hpdata_age_heap, hpdata_t, PSSET_ENUMERATE_MAX_NUM);
struct hpdata_s {
/*
* We likewise follow the edata convention of mangling names and forcing
@ -64,7 +73,7 @@ struct hpdata_s {
bool h_hugify_allowed;
/* When we became a hugification candidate. */
nstime_t h_time_hugify_allowed;
bool h_in_psset_hugify_container;
/* Whether or not a purge or hugify is currently happening. */
bool h_mid_purge;
@ -115,6 +124,12 @@ struct hpdata_s {
/* The touched pages (using the same definition as above). */
fb_group_t touched_pages[FB_NGROUPS(HUGEPAGE_PAGES)];
/* Time when this extent (hpdata) becomes eligible for purging */
nstime_t h_time_purge_allowed;
/* True if the extent was huge and empty the last time it was purged. */
bool h_purged_when_empty_and_huge;
};
TYPED_LIST(hpdata_empty_list, hpdata_t, ql_link_empty)
@ -177,8 +192,8 @@ hpdata_purge_allowed_get(const hpdata_t *hpdata) {
static inline void
hpdata_purge_allowed_set(hpdata_t *hpdata, bool purge_allowed) {
assert(purge_allowed == false || !hpdata->h_mid_purge);
hpdata->h_purge_allowed = purge_allowed;
}
static inline bool
@ -241,7 +256,6 @@ hpdata_changing_state_get(const hpdata_t *hpdata) {
return hpdata->h_mid_purge || hpdata->h_mid_hugify;
}
static inline bool
hpdata_updating_get(const hpdata_t *hpdata) {
return hpdata->h_updating;
@ -276,17 +290,17 @@ hpdata_longest_free_range_set(hpdata_t *hpdata, size_t longest_free_range) {
}
static inline size_t
hpdata_nactive_get(const hpdata_t *hpdata) {
return hpdata->h_nactive;
}
static inline size_t
hpdata_ntouched_get(const hpdata_t *hpdata) {
return hpdata->h_ntouched;
}
static inline size_t
hpdata_ndirty_get(const hpdata_t *hpdata) {
return hpdata->h_ntouched - hpdata->h_nactive;
}
@ -295,6 +309,26 @@ hpdata_nretained_get(hpdata_t *hpdata) {
return HUGEPAGE_PAGES - hpdata->h_ntouched;
}
static inline void
hpdata_time_purge_allowed_set(hpdata_t *hpdata, const nstime_t *v) {
nstime_copy(&hpdata->h_time_purge_allowed, v);
}
static inline const nstime_t *
hpdata_time_purge_allowed_get(const hpdata_t *hpdata) {
return &hpdata->h_time_purge_allowed;
}
static inline bool
hpdata_purged_when_empty_and_huge_get(const hpdata_t *hpdata) {
return hpdata->h_purged_when_empty_and_huge;
}
static inline void
hpdata_purged_when_empty_and_huge_set(hpdata_t *hpdata, bool v) {
hpdata->h_purged_when_empty_and_huge = v;
}
static inline void
hpdata_assert_empty(hpdata_t *hpdata) {
assert(fb_empty(hpdata->active_pages, HUGEPAGE_PAGES));
@ -308,58 +342,95 @@ hpdata_assert_empty(hpdata_t *hpdata) {
*/
static inline bool
hpdata_consistent(hpdata_t *hpdata) {
bool res = true;
const size_t active_urange_longest = fb_urange_longest(
hpdata->active_pages, HUGEPAGE_PAGES);
const size_t longest_free_range = hpdata_longest_free_range_get(hpdata);
if (active_urange_longest != longest_free_range) {
malloc_printf(
"<jemalloc>: active_fb_urange_longest=%zu != hpdata_longest_free_range=%zu\n",
active_urange_longest, longest_free_range);
res = false;
}
const size_t active_scount = fb_scount(
hpdata->active_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES);
if (active_scount != hpdata->h_nactive) {
malloc_printf(
"<jemalloc>: active_fb_scount=%zu != hpdata_nactive=%zu\n",
active_scount, hpdata->h_nactive);
res = false;
}
const size_t touched_scount = fb_scount(
hpdata->touched_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES);
if (touched_scount != hpdata->h_ntouched) {
malloc_printf(
"<jemalloc>: touched_fb_scount=%zu != hpdata_ntouched=%zu\n",
touched_scount, hpdata->h_ntouched);
res = false;
}
if (hpdata->h_ntouched < hpdata->h_nactive) {
malloc_printf(
"<jemalloc>: hpdata_ntouched=%zu < hpdata_nactive=%zu\n",
hpdata->h_ntouched, hpdata->h_nactive);
res = false;
}
if (hpdata->h_huge && (hpdata->h_ntouched != HUGEPAGE_PAGES)) {
malloc_printf(
"<jemalloc>: hpdata_huge=%d && (hpdata_ntouched=%zu != hugepage_pages=%zu)\n",
hpdata->h_huge, hpdata->h_ntouched, HUGEPAGE_PAGES);
res = false;
}
const bool changing_state = hpdata_changing_state_get(hpdata);
if (changing_state
&& (hpdata->h_purge_allowed || hpdata->h_hugify_allowed)) {
malloc_printf(
"<jemalloc>: hpdata_changing_state=%d && (hpdata_purge_allowed=%d || hpdata_hugify_allowed=%d)\n",
changing_state, hpdata->h_purge_allowed,
hpdata->h_hugify_allowed);
res = false;
}
if (hpdata_hugify_allowed_get(hpdata)
!= hpdata_in_psset_hugify_container_get(hpdata)) {
malloc_printf(
"<jemalloc>: hpdata_hugify_allowed=%d != hpdata_in_psset_hugify_container=%d\n",
hpdata_hugify_allowed_get(hpdata),
hpdata_in_psset_hugify_container_get(hpdata));
res = false;
}
return res;
}
#define hpdata_assert_consistent(hpdata) \
do { \
assert(hpdata_consistent(hpdata)); \
} while (0)
static inline bool
hpdata_empty(const hpdata_t *hpdata) {
return hpdata->h_nactive == 0;
}
static inline bool
hpdata_full(const hpdata_t *hpdata) {
return hpdata->h_nactive == HUGEPAGE_PAGES;
}
void hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age, bool is_huge);
/*
* Given an hpdata which can serve an allocation request, pick and reserve an
* offset within that allocation.
*/
void *hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz);
void hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz);
/*
* The hpdata_purge_prepare_t allows grabbing the metadata required to purge
@ -368,10 +439,10 @@ void hpdata_unreserve(hpdata_t *hpdata, void *begin, size_t sz);
*/
typedef struct hpdata_purge_state_s hpdata_purge_state_t;
struct hpdata_purge_state_s {
size_t npurged;
size_t ndirty_to_purge;
fb_group_t to_purge[FB_NGROUPS(HUGEPAGE_PAGES)];
size_t next_purge_search_begin;
};
/*
@ -386,9 +457,11 @@ struct hpdata_purge_state_s {
* until you're done, and then end. Allocating out of an hpdata undergoing
* purging is not allowed.
*
* Returns the number of dirty pages that will be purged and sets nranges
* to number of ranges with dirty pages that will be purged.
*/
size_t hpdata_purge_begin(
hpdata_t *hpdata, hpdata_purge_state_t *purge_state, size_t *nranges);
/*
* If there are more extents to purge, sets *r_purge_addr and *r_purge_size to


@ -1,6 +1,9 @@
#ifndef JEMALLOC_INTERNAL_INSPECT_H
#define JEMALLOC_INTERNAL_INSPECT_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/tsd_types.h"
/*
* This module contains the heap introspection capabilities. For now they are
* exposed purely through mallctl APIs in the experimental namespace, but this
@ -23,7 +26,7 @@ typedef struct inspect_extent_util_stats_verbose_s
inspect_extent_util_stats_verbose_t;
struct inspect_extent_util_stats_verbose_s {
void *slabcur_addr;
size_t nfree;
size_t nregs;
size_t size;
@ -31,10 +34,10 @@ struct inspect_extent_util_stats_verbose_s {
size_t bin_nregs;
};
void inspect_extent_util_stats_get(
tsdn_t *tsdn, const void *ptr, size_t *nfree, size_t *nregs, size_t *size);
void inspect_extent_util_stats_verbose_get(tsdn_t *tsdn, const void *ptr,
size_t *nfree, size_t *nregs, size_t *size, size_t *bin_nfree,
size_t *bin_nregs, void **slabcur_addr);
#endif /* JEMALLOC_INTERNAL_INSPECT_H */


@ -3,64 +3,65 @@
#include <math.h>
#ifdef _WIN32
# include <windows.h>
# include "msvc_compat/windows_extra.h"
# include "msvc_compat/strings.h"
# ifdef _WIN64
# if LG_VADDR <= 32
# error Generate the headers using x64 vcargs
# endif
# else
# if LG_VADDR > 32
# undef LG_VADDR
# define LG_VADDR 32
# endif
# endif
#else
# include <sys/param.h>
# include <sys/mman.h>
# if !defined(__pnacl__) && !defined(__native_client__)
# include <sys/syscall.h>
# if !defined(SYS_write) && defined(__NR_write)
# define SYS_write __NR_write
# endif
# if defined(SYS_open) && defined(__aarch64__)
/* Android headers may define SYS_open to __NR_open even though
* __NR_open may not exist on AArch64 (superseded by __NR_openat). */
# undef SYS_open
# endif
# include <sys/uio.h>
# endif
# include <pthread.h>
# if defined(__FreeBSD__) || defined(__DragonFly__) \
|| defined(__OpenBSD__)
# include <pthread_np.h>
# include <sched.h>
# if defined(__FreeBSD__)
# define cpu_set_t cpuset_t
# endif
# endif
# include <signal.h>
# ifdef JEMALLOC_OS_UNFAIR_LOCK
# include <os/lock.h>
# endif
# ifdef JEMALLOC_GLIBC_MALLOC_HOOK
# include <sched.h>
# endif
# include <errno.h>
# include <sys/time.h>
# include <time.h>
# ifdef JEMALLOC_HAVE_MACH_ABSOLUTE_TIME
# include <mach/mach_time.h>
# endif
#endif
#include <sys/types.h>
#include <limits.h>
#ifndef SIZE_T_MAX
# define SIZE_T_MAX SIZE_MAX
#endif
#ifndef SSIZE_MAX
# define SSIZE_MAX ((ssize_t)(SIZE_T_MAX >> 1))
#endif
#include <stdarg.h>
#include <stdbool.h>
@ -69,30 +70,30 @@
#include <stdint.h>
#include <stddef.h>
#ifndef offsetof
# define offsetof(type, member) ((size_t) & (((type *)NULL)->member))
#endif
#include <string.h>
#include <strings.h>
#include <ctype.h>
#ifdef _MSC_VER
# include <io.h>
typedef intptr_t ssize_t;
# define PATH_MAX 1024
# define STDERR_FILENO 2
# define __func__ __FUNCTION__
# ifdef JEMALLOC_HAS_RESTRICT
# define restrict __restrict
# endif
/* Disable warnings about deprecated system functions. */
# pragma warning(disable : 4996)
# if _MSC_VER < 1800
static int
isblank(int c) {
return (c == '\t' || c == ' ');
}
# endif
#else
# include <unistd.h>
#endif
#include <fcntl.h>
@ -102,7 +103,24 @@ isblank(int c) {
* classes.
*/
#ifdef small
# undef small
#endif
/*
* Oftentimes we'd like to perform some kind of arithmetic to obtain
* a pointer from another pointer but with some offset or mask applied.
* Naively you would accomplish this by casting the source pointer to
* `uintptr_t`, performing all of the relevant arithmetic, and then casting
* the result to the desired pointer type. However, this has the unfortunate
* side-effect of concealing pointer provenance, hiding useful information for
* optimization from the compiler (see here for details:
* https://clang.llvm.org/extra/clang-tidy/checks/performance/no-int-to-ptr.html)
* Instead what one should do is cast the source pointer to `char *` and perform
* the equivalent arithmetic (since `char` of course represents one byte). But
* because `char *` has the semantic meaning of "string", we define this typedef
* simply to make it clearer where we are performing such pointer arithmetic.
*/
typedef char byte_t;
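A tiny example of the byte_t arithmetic the comment above recommends (ptr_offset is a hypothetical helper, not part of jemalloc): offset the pointer through `byte_t *` rather than round-tripping through `uintptr_t`, which would hide pointer provenance from the optimizer.

```c
#include <assert.h>
#include <stddef.h>

typedef char byte_t;

/* Advance a pointer by nbytes without erasing pointer provenance. */
static void *
ptr_offset(void *ptr, size_t nbytes) {
	return (void *)((byte_t *)ptr + nbytes);
}
```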
#endif /* JEMALLOC_INTERNAL_H */


@ -14,10 +14,13 @@
*/
#undef JEMALLOC_OVERRIDE___LIBC_CALLOC
#undef JEMALLOC_OVERRIDE___LIBC_FREE
#undef JEMALLOC_OVERRIDE___LIBC_FREE_SIZED
#undef JEMALLOC_OVERRIDE___LIBC_FREE_ALIGNED_SIZED
#undef JEMALLOC_OVERRIDE___LIBC_MALLOC
#undef JEMALLOC_OVERRIDE___LIBC_MEMALIGN
#undef JEMALLOC_OVERRIDE___LIBC_REALLOC
#undef JEMALLOC_OVERRIDE___LIBC_VALLOC
#undef JEMALLOC_OVERRIDE___LIBC_PVALLOC
#undef JEMALLOC_OVERRIDE___POSIX_MEMALIGN
/*
@ -88,6 +91,9 @@
/* Defined if pthread_getname_np(3) is available. */
#undef JEMALLOC_HAVE_PTHREAD_GETNAME_NP
/* Defined if pthread_set_name_np(3) is available. */
#undef JEMALLOC_HAVE_PTHREAD_SET_NAME_NP
/* Defined if pthread_get_name_np(3) is available. */
#undef JEMALLOC_HAVE_PTHREAD_GET_NAME_NP
@ -111,6 +117,11 @@
*/
#undef JEMALLOC_HAVE_CLOCK_REALTIME
/*
* Defined if clock_gettime_nsec_np(CLOCK_UPTIME_RAW) is available.
*/
#undef JEMALLOC_HAVE_CLOCK_GETTIME_NSEC_NP
/*
* Defined if _malloc_thread_cleanup() exists. At least in the case of
* FreeBSD, pthread_key_create() allocates, which if used during malloc
@ -161,6 +172,15 @@
/* Use gcc intrinsics for profile backtracing if defined. */
#undef JEMALLOC_PROF_GCC
/* Use frame pointer for profile backtracing if defined. Linux only. */
#undef JEMALLOC_PROF_FRAME_POINTER
/* Defined if page id support (JEMALLOC_PAGEID) is enabled. */
#undef JEMALLOC_PAGEID
/* Defined if prctl(2) is available. */
#undef JEMALLOC_HAVE_PRCTL
/*
* JEMALLOC_DSS enables use of sbrk(2) to allocate extents from the data storage
* segment (DSS).
@ -259,6 +279,12 @@
*/
#undef JEMALLOC_READLINKAT
/*
* If defined, use getenv() (instead of secure_getenv() or
* alternatives) to access MALLOC_CONF.
*/
#undef JEMALLOC_FORCE_GETENV
/*
* Darwin (OS X) uses zones to work around Mach-O symbol override shortcomings.
*/
@ -282,6 +308,13 @@
*/
#undef JEMALLOC_HAVE_MADVISE_HUGE
/*
* Defined if best-effort synchronous collapse of native pages mapped by a
* memory range into transparent huge pages is supported via the
* MADV_COLLAPSE argument to madvise(2).
*/
#undef JEMALLOC_HAVE_MADVISE_COLLAPSE
/*
* Methods for purging unused pages differ between operating systems.
*
@ -312,9 +345,23 @@
*/
#undef JEMALLOC_MADVISE_NOCORE
/* Defined if process_madvise(2) is available. */
#undef JEMALLOC_HAVE_PROCESS_MADVISE
#undef EXPERIMENTAL_SYS_PROCESS_MADVISE_NR
/* Defined if mprotect(2) is available. */
#undef JEMALLOC_HAVE_MPROTECT
/* Defined if sys/sdt.h is available and SDT tracing is enabled. */
#undef JEMALLOC_EXPERIMENTAL_USDT_STAP
/*
* Defined if sys/sdt.h is unavailable, SDT tracing is enabled, and the
* platform is supported.
*/
#undef JEMALLOC_EXPERIMENTAL_USDT_CUSTOM
/*
* Defined if transparent huge pages (THPs) are supported via the
* MADV_[NO]HUGEPAGE arguments to madvise(2), and THP support is enabled.
@ -378,12 +425,18 @@
/* Adaptive mutex support in pthreads. */
#undef JEMALLOC_HAVE_PTHREAD_MUTEX_ADAPTIVE_NP
/* gettid() support */
#undef JEMALLOC_HAVE_GETTID
/* GNU specific sched_getcpu support */
#undef JEMALLOC_HAVE_SCHED_GETCPU
/* GNU specific sched_setaffinity support */
#undef JEMALLOC_HAVE_SCHED_SETAFFINITY
/* pthread_setaffinity_np support */
#undef JEMALLOC_HAVE_PTHREAD_SETAFFINITY_NP
/*
* If defined, all the features necessary for background threads are present.
*/
@ -424,4 +477,18 @@
/* If defined, realloc(ptr, 0) defaults to "free" instead of "alloc". */
#undef JEMALLOC_ZERO_REALLOC_DEFAULT_FREE
/* If defined, use volatile asm during benchmarks. */
#undef JEMALLOC_HAVE_ASM_VOLATILE
/*
* If defined, support the use of rdtscp to get the time stamp counter
* and the processor ID.
*/
#undef JEMALLOC_HAVE_RDTSCP
/* If defined, use __int128 for optimization. */
#undef JEMALLOC_HAVE_INT128
#include "jemalloc/internal/jemalloc_internal_overrides.h"
#endif /* JEMALLOC_INTERNAL_DEFS_H_ */


@ -1,39 +1,52 @@
#ifndef JEMALLOC_INTERNAL_EXTERNS_H
#define JEMALLOC_INTERNAL_EXTERNS_H
#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/fxp.h"
#include "jemalloc/internal/hpa_opts.h"
#include "jemalloc/internal/nstime.h"
#include "jemalloc/internal/sec_opts.h"
#include "jemalloc/internal/tsd_types.h"
/* TSD checks this to set thread local slow state accordingly. */
extern bool malloc_slow;
/* Run-time options. */
extern bool opt_abort;
extern bool opt_abort_conf;
extern bool opt_trust_madvise;
extern bool opt_experimental_hpa_start_huge_if_thp_always;
extern bool opt_experimental_hpa_enforce_hugify;
extern bool opt_confirm_conf;
extern bool opt_hpa;
extern hpa_shard_opts_t opt_hpa_opts;
extern sec_opts_t opt_hpa_sec_opts;
extern const char *opt_junk;
extern bool opt_junk_alloc;
extern bool opt_junk_free;
extern void (*JET_MUTABLE junk_free_callback)(void *ptr, size_t size);
extern void (*JET_MUTABLE junk_alloc_callback)(void *ptr, size_t size);
extern void (*JET_MUTABLE invalid_conf_abort)(void);
extern bool opt_utrace;
extern bool opt_xmalloc;
extern bool opt_experimental_infallible_new;
extern bool opt_experimental_tcache_gc;
extern bool opt_zero;
extern unsigned opt_narenas;
extern fxp_t opt_narenas_ratio;
extern zero_realloc_action_t opt_zero_realloc_action;
extern malloc_init_t malloc_init_state;
extern const char *zero_realloc_mode_names[];
extern atomic_zu_t zero_realloc_count;
extern bool opt_cache_oblivious;
extern malloc_init_t malloc_init_state;
extern const char *const zero_realloc_mode_names[];
extern atomic_zu_t zero_realloc_count;
extern bool opt_cache_oblivious;
extern unsigned opt_debug_double_free_max_scan;
extern size_t opt_calloc_madvise_threshold;
extern bool opt_disable_large_size_classes;
extern const char *opt_malloc_conf_symlink;
extern const char *opt_malloc_conf_env_var;
/* Escape free-fastpath when ptr & mask == 0 (for sanitization purposes). */
extern uintptr_t san_cache_bin_nonfast_mask;
@ -53,23 +66,26 @@ extern unsigned manual_arena_base;
*/
extern atomic_p_t arenas[];
void *a0malloc(size_t size);
void a0dalloc(void *ptr);
void *bootstrap_malloc(size_t size);
void *bootstrap_calloc(size_t num, size_t size);
void bootstrap_free(void *ptr);
void arena_set(unsigned ind, arena_t *arena);
extern unsigned huge_arena_ind;
void *a0malloc(size_t size);
void a0dalloc(void *ptr);
void *bootstrap_malloc(size_t size);
void *bootstrap_calloc(size_t num, size_t size);
void bootstrap_free(void *ptr);
void arena_set(unsigned ind, arena_t *arena);
unsigned narenas_total_get(void);
arena_t *arena_init(tsdn_t *tsdn, unsigned ind, const arena_config_t *config);
arena_t *arena_choose_hard(tsd_t *tsd, bool internal);
void arena_migrate(tsd_t *tsd, arena_t *oldarena, arena_t *newarena);
void iarena_cleanup(tsd_t *tsd);
void arena_cleanup(tsd_t *tsd);
size_t batch_alloc(void **ptrs, size_t num, size_t size, int flags);
void jemalloc_prefork(void);
void jemalloc_postfork_parent(void);
void jemalloc_postfork_child(void);
void je_sdallocx_noflags(void *ptr, size_t size);
void *malloc_default(size_t size);
void arena_migrate(tsd_t *tsd, arena_t *oldarena, arena_t *newarena);
void iarena_cleanup(tsd_t *tsd);
void arena_cleanup(tsd_t *tsd);
size_t batch_alloc(void **ptrs, size_t num, size_t size, int flags);
void jemalloc_prefork(void);
void jemalloc_postfork_parent(void);
void jemalloc_postfork_child(void);
void sdallocx_default(void *ptr, size_t size, int flags);
void free_default(void *ptr);
void *malloc_default(size_t size);
#endif /* JEMALLOC_INTERNAL_EXTERNS_H */

View file

@ -1,10 +1,14 @@
#ifndef JEMALLOC_INTERNAL_INLINES_A_H
#define JEMALLOC_INTERNAL_INLINES_A_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_externs.h"
#include "jemalloc/internal/arena_types.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/bit_util.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/sc.h"
#include "jemalloc/internal/tcache_externs.h"
#include "jemalloc/internal/ticker.h"
JEMALLOC_ALWAYS_INLINE malloc_cpuid_t
@ -14,6 +18,15 @@ malloc_getcpu(void) {
return GetCurrentProcessorNumber();
#elif defined(JEMALLOC_HAVE_SCHED_GETCPU)
return (malloc_cpuid_t)sched_getcpu();
#elif defined(JEMALLOC_HAVE_RDTSCP)
unsigned int ecx;
asm volatile("rdtscp" : "=c"(ecx)::"eax", "edx");
return (malloc_cpuid_t)(ecx & 0xfff);
#elif defined(__aarch64__) && defined(__APPLE__)
/* Other OSes most likely use tpidr_el0 instead. */
uintptr_t c;
asm volatile("mrs %x0, tpidrro_el0" : "=r"(c)::"memory");
return (malloc_cpuid_t)(c & (1 << 3) - 1);
#else
not_reached();
return -1;
@ -29,8 +42,8 @@ percpu_arena_choose(void) {
assert(cpuid >= 0);
unsigned arena_ind;
if ((opt_percpu_arena == percpu_arena) || ((unsigned)cpuid < ncpus /
2)) {
if ((opt_percpu_arena == percpu_arena)
|| ((unsigned)cpuid < ncpus / 2)) {
arena_ind = cpuid;
} else {
assert(opt_percpu_arena == per_phycpu_arena);
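The arena choice above can be sketched in standalone C. This is an illustration only, not jemalloc's API: the function name, the `per_phycpu` flag, and the assumption that `cpuid` and `cpuid + ncpus/2` are hyperthread siblings are all stand-ins for the real `opt_percpu_arena` machinery.

```c
#include <assert.h>

/*
 * Minimal sketch of the per-CPU arena mapping, assuming ncpus logical CPUs
 * where cpuid and cpuid + ncpus/2 are hyperthread siblings.
 */
static unsigned
percpu_arena_ind(unsigned cpuid, unsigned ncpus, int per_phycpu) {
	if (!per_phycpu || cpuid < ncpus / 2) {
		return cpuid; /* one arena per logical CPU */
	}
	return cpuid - ncpus / 2; /* sibling hyperthread shares its arena */
}
```

Under the per-physical-CPU policy, both hyperthreads of a core map to one arena index, halving the number of arenas in use.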

View file

@ -1,7 +1,10 @@
#ifndef JEMALLOC_INTERNAL_INLINES_B_H
#define JEMALLOC_INTERNAL_INLINES_B_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_inlines_a.h"
#include "jemalloc/internal/extent.h"
#include "jemalloc/internal/jemalloc_internal_inlines_a.h"
static inline void
percpu_arena_update(tsd_t *tsd, unsigned cpu) {
@ -20,13 +23,13 @@ percpu_arena_update(tsd_t *tsd, unsigned cpu) {
tcache_t *tcache = tcache_get(tsd);
if (tcache != NULL) {
tcache_slow_t *tcache_slow = tsd_tcache_slowp_get(tsd);
tcache_arena_reassociate(tsd_tsdn(tsd), tcache_slow,
tcache, newarena);
assert(tcache_slow->arena != NULL);
tcache_arena_reassociate(
tsd_tsdn(tsd), tcache_slow, tcache, newarena);
}
}
}
/* Choose an arena based on a per-thread value. */
static inline arena_t *
arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) {
@ -47,18 +50,18 @@ arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) {
assert(ret);
if (tcache_available(tsd)) {
tcache_slow_t *tcache_slow = tsd_tcache_slowp_get(tsd);
tcache_t *tcache = tsd_tcachep_get(tsd);
tcache_t *tcache = tsd_tcachep_get(tsd);
if (tcache_slow->arena != NULL) {
/* See comments in tsd_tcache_data_init().*/
assert(tcache_slow->arena ==
arena_get(tsd_tsdn(tsd), 0, false));
assert(tcache_slow->arena
== arena_get(tsd_tsdn(tsd), 0, false));
if (tcache_slow->arena != ret) {
tcache_arena_reassociate(tsd_tsdn(tsd),
tcache_slow, tcache, ret);
}
} else {
tcache_arena_associate(tsd_tsdn(tsd),
tcache_slow, tcache, ret);
tcache_arena_associate(
tsd_tsdn(tsd), tcache_slow, tcache, ret);
}
}
}
@ -68,10 +71,10 @@ arena_choose_impl(tsd_t *tsd, arena_t *arena, bool internal) {
* auto percpu arena range, (i.e. thread is assigned to a manually
* managed arena), then percpu arena is skipped.
*/
if (have_percpu_arena && PERCPU_ARENA_ENABLED(opt_percpu_arena) &&
!internal && (arena_ind_get(ret) <
percpu_arena_ind_limit(opt_percpu_arena)) && (ret->last_thd !=
tsd_tsdn(tsd))) {
if (have_percpu_arena && PERCPU_ARENA_ENABLED(opt_percpu_arena)
&& !internal
&& (arena_ind_get(ret) < percpu_arena_ind_limit(opt_percpu_arena))
&& (ret->last_thd != tsd_tsdn(tsd))) {
unsigned ind = percpu_arena_choose();
if (arena_ind_get(ret) != ind) {
percpu_arena_update(tsd, ind);

View file

@ -1,6 +1,10 @@
#ifndef JEMALLOC_INTERNAL_INLINES_C_H
#define JEMALLOC_INTERNAL_INLINES_C_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/arena_externs.h"
#include "jemalloc/internal/arena_inlines_b.h"
#include "jemalloc/internal/emap.h"
#include "jemalloc/internal/hook.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#include "jemalloc/internal/log.h"
@ -8,6 +12,15 @@
#include "jemalloc/internal/thread_event.h"
#include "jemalloc/internal/witness.h"
/*
* These correspond to the macros in jemalloc/jemalloc_macros.h. Broadly, we
* should have one constant here per magic value there. Note however that the
* representations need not be related.
*/
#define TCACHE_IND_NONE ((unsigned)-1)
#define TCACHE_IND_AUTOMATIC ((unsigned)-2)
#define ARENA_IND_AUTOMATIC ((unsigned)-1)
/*
* Translating the names of the 'i' functions:
* Abbreviations used in the first part of the function name (before
@ -41,24 +54,35 @@ isalloc(tsdn_t *tsdn, const void *ptr) {
}
JEMALLOC_ALWAYS_INLINE void *
iallocztm(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, tcache_t *tcache,
bool is_internal, arena_t *arena, bool slow_path) {
iallocztm_explicit_slab(tsdn_t *tsdn, size_t size, szind_t ind, bool zero,
bool slab, tcache_t *tcache, bool is_internal, arena_t *arena,
bool slow_path) {
void *ret;
assert(!slab || sz_can_use_slab(size)); /* slab && large is illegal */
assert(!is_internal || tcache == NULL);
assert(!is_internal || arena == NULL || arena_is_auto(arena));
if (!tsdn_null(tsdn) && tsd_reentrancy_level_get(tsdn_tsd(tsdn)) == 0) {
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
witness_assert_depth_to_rank(
tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
}
ret = arena_malloc(tsdn, arena, size, ind, zero, tcache, slow_path);
ret = arena_malloc(
tsdn, arena, size, ind, zero, slab, tcache, slow_path);
if (config_stats && is_internal && likely(ret != NULL)) {
arena_internal_add(iaalloc(tsdn, ret), isalloc(tsdn, ret));
}
return ret;
}
JEMALLOC_ALWAYS_INLINE void *
iallocztm(tsdn_t *tsdn, size_t size, szind_t ind, bool zero, tcache_t *tcache,
bool is_internal, arena_t *arena, bool slow_path) {
bool slab = sz_can_use_slab(size);
return iallocztm_explicit_slab(
tsdn, size, ind, zero, slab, tcache, is_internal, arena, slow_path);
}
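The `_explicit_slab` split above follows a simple pattern: the plain entry point derives slab-ness from the size, while callers that already know pass it through, and `slab && large` is forbidden by assertion. A minimal sketch, with a hypothetical small/large boundary standing in for `sz_can_use_slab`:

```c
#include <assert.h>
#include <stddef.h>

#define SMALL_MAXCLASS 14336 /* hypothetical small/large boundary */

static int
can_use_slab(size_t size) {
	return size <= SMALL_MAXCLASS;
}

/* Explicit variant: the caller states slab-ness; slab && large is illegal. */
static int
alloc_explicit_slab(size_t size, int slab) {
	assert(!slab || can_use_slab(size));
	return slab; /* stand-in for the real allocation */
}

/* Convenience wrapper derives slab-ness from size, as iallocztm does. */
static int
alloc_auto(size_t size) {
	return alloc_explicit_slab(size, can_use_slab(size));
}
```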
JEMALLOC_ALWAYS_INLINE void *
ialloc(tsd_t *tsd, size_t size, szind_t ind, bool zero, bool slow_path) {
return iallocztm(tsd_tsdn(tsd), size, ind, zero, tcache_get(tsd), false,
@ -66,18 +90,19 @@ ialloc(tsd_t *tsd, size_t size, szind_t ind, bool zero, bool slow_path) {
}
JEMALLOC_ALWAYS_INLINE void *
ipallocztm(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
tcache_t *tcache, bool is_internal, arena_t *arena) {
ipallocztm_explicit_slab(tsdn_t *tsdn, size_t usize, size_t alignment,
bool zero, bool slab, tcache_t *tcache, bool is_internal, arena_t *arena) {
void *ret;
assert(!slab || sz_can_use_slab(usize)); /* slab && large is illegal */
assert(usize != 0);
assert(usize == sz_sa2u(usize, alignment));
assert(!is_internal || tcache == NULL);
assert(!is_internal || arena == NULL || arena_is_auto(arena));
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
witness_assert_depth_to_rank(
tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
ret = arena_palloc(tsdn, arena, usize, alignment, zero, tcache);
ret = arena_palloc(tsdn, arena, usize, alignment, zero, slab, tcache);
assert(ALIGNMENT_ADDR2BASE(ret, alignment) == ret);
if (config_stats && is_internal && likely(ret != NULL)) {
arena_internal_add(iaalloc(tsdn, ret), isalloc(tsdn, ret));
@ -85,12 +110,26 @@ ipallocztm(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
return ret;
}
JEMALLOC_ALWAYS_INLINE void *
ipallocztm(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
tcache_t *tcache, bool is_internal, arena_t *arena) {
return ipallocztm_explicit_slab(tsdn, usize, alignment, zero,
sz_can_use_slab(usize), tcache, is_internal, arena);
}
JEMALLOC_ALWAYS_INLINE void *
ipalloct(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
tcache_t *tcache, arena_t *arena) {
return ipallocztm(tsdn, usize, alignment, zero, tcache, false, arena);
}
JEMALLOC_ALWAYS_INLINE void *
ipalloct_explicit_slab(tsdn_t *tsdn, size_t usize, size_t alignment, bool zero,
bool slab, tcache_t *tcache, arena_t *arena) {
return ipallocztm_explicit_slab(
tsdn, usize, alignment, zero, slab, tcache, false, arena);
}
JEMALLOC_ALWAYS_INLINE void *
ipalloc(tsd_t *tsd, size_t usize, size_t alignment, bool zero) {
return ipallocztm(tsd_tsdn(tsd), usize, alignment, zero,
@ -108,13 +147,13 @@ idalloctm(tsdn_t *tsdn, void *ptr, tcache_t *tcache,
assert(ptr != NULL);
assert(!is_internal || tcache == NULL);
assert(!is_internal || arena_is_auto(iaalloc(tsdn, ptr)));
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
witness_assert_depth_to_rank(
tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
if (config_stats && is_internal) {
arena_internal_sub(iaalloc(tsdn, ptr), isalloc(tsdn, ptr));
}
if (!is_internal && !tsdn_null(tsdn) &&
tsd_reentrancy_level_get(tsdn_tsd(tsdn)) != 0) {
if (!is_internal && !tsdn_null(tsdn)
&& tsd_reentrancy_level_get(tsdn_tsd(tsdn)) != 0) {
assert(tcache == NULL);
}
arena_dalloc(tsdn, ptr, tcache, alloc_ctx, slow_path);
@ -128,25 +167,26 @@ idalloc(tsd_t *tsd, void *ptr) {
JEMALLOC_ALWAYS_INLINE void
isdalloct(tsdn_t *tsdn, void *ptr, size_t size, tcache_t *tcache,
emap_alloc_ctx_t *alloc_ctx, bool slow_path) {
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
witness_assert_depth_to_rank(
tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
arena_sdalloc(tsdn, ptr, size, tcache, alloc_ctx, slow_path);
}
JEMALLOC_ALWAYS_INLINE void *
iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
size_t alignment, bool zero, tcache_t *tcache, arena_t *arena,
size_t alignment, bool zero, bool slab, tcache_t *tcache, arena_t *arena,
hook_ralloc_args_t *hook_args) {
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
void *p;
witness_assert_depth_to_rank(
tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
void *p;
size_t usize, copysize;
usize = sz_sa2u(size, alignment);
if (unlikely(usize == 0 || usize > SC_LARGE_MAXCLASS)) {
return NULL;
}
p = ipalloct(tsdn, usize, alignment, zero, tcache, arena);
p = ipalloct_explicit_slab(
tsdn, usize, alignment, zero, slab, tcache, arena);
if (p == NULL) {
return NULL;
}
@ -156,11 +196,12 @@ iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
*/
copysize = (size < oldsize) ? size : oldsize;
memcpy(p, ptr, copysize);
hook_invoke_alloc(hook_args->is_realloc
? hook_alloc_realloc : hook_alloc_rallocx, p, (uintptr_t)p,
hook_args->args);
hook_invoke_dalloc(hook_args->is_realloc
? hook_dalloc_realloc : hook_dalloc_rallocx, ptr, hook_args->args);
hook_invoke_alloc(
hook_args->is_realloc ? hook_alloc_realloc : hook_alloc_rallocx, p,
(uintptr_t)p, hook_args->args);
hook_invoke_dalloc(
hook_args->is_realloc ? hook_dalloc_realloc : hook_dalloc_rallocx,
ptr, hook_args->args);
isdalloct(tsdn, ptr, oldsize, tcache, NULL, true);
return p;
}
@ -173,33 +214,42 @@ iralloct_realign(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
* passed-around anywhere.
*/
JEMALLOC_ALWAYS_INLINE void *
iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment,
bool zero, tcache_t *tcache, arena_t *arena, hook_ralloc_args_t *hook_args)
{
iralloct_explicit_slab(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size,
size_t alignment, bool zero, bool slab, tcache_t *tcache, arena_t *arena,
hook_ralloc_args_t *hook_args) {
assert(ptr != NULL);
assert(size != 0);
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
witness_assert_depth_to_rank(
tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1))
!= 0) {
if (alignment != 0
&& ((uintptr_t)ptr & ((uintptr_t)alignment - 1)) != 0) {
/*
* Existing object alignment is inadequate; allocate new space
* and copy.
*/
return iralloct_realign(tsdn, ptr, oldsize, size, alignment,
zero, tcache, arena, hook_args);
zero, slab, tcache, arena, hook_args);
}
return arena_ralloc(tsdn, arena, ptr, oldsize, size, alignment, zero,
tcache, hook_args);
slab, tcache, hook_args);
}
JEMALLOC_ALWAYS_INLINE void *
iralloct(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t alignment,
size_t usize, bool zero, tcache_t *tcache, arena_t *arena,
hook_ralloc_args_t *hook_args) {
bool slab = sz_can_use_slab(usize);
return iralloct_explicit_slab(tsdn, ptr, oldsize, size, alignment, zero,
slab, tcache, arena, hook_args);
}
JEMALLOC_ALWAYS_INLINE void *
iralloc(tsd_t *tsd, void *ptr, size_t oldsize, size_t size, size_t alignment,
bool zero, hook_ralloc_args_t *hook_args) {
return iralloct(tsd_tsdn(tsd), ptr, oldsize, size, alignment, zero,
tcache_get(tsd), NULL, hook_args);
size_t usize, bool zero, hook_ralloc_args_t *hook_args) {
return iralloct(tsd_tsdn(tsd), ptr, oldsize, size, alignment, usize,
zero, tcache_get(tsd), NULL, hook_args);
}
JEMALLOC_ALWAYS_INLINE bool
@ -207,29 +257,27 @@ ixalloc(tsdn_t *tsdn, void *ptr, size_t oldsize, size_t size, size_t extra,
size_t alignment, bool zero, size_t *newsize) {
assert(ptr != NULL);
assert(size != 0);
witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
WITNESS_RANK_CORE, 0);
witness_assert_depth_to_rank(
tsdn_witness_tsdp_get(tsdn), WITNESS_RANK_CORE, 0);
if (alignment != 0 && ((uintptr_t)ptr & ((uintptr_t)alignment-1))
!= 0) {
if (alignment != 0
&& ((uintptr_t)ptr & ((uintptr_t)alignment - 1)) != 0) {
/* Existing object alignment is inadequate. */
*newsize = oldsize;
return true;
}
return arena_ralloc_no_move(tsdn, ptr, oldsize, size, extra, zero,
newsize);
return arena_ralloc_no_move(
tsdn, ptr, oldsize, size, extra, zero, newsize);
}
JEMALLOC_ALWAYS_INLINE void
fastpath_success_finish(tsd_t *tsd, uint64_t allocated_after,
cache_bin_t *bin, void *ret) {
fastpath_success_finish(
tsd_t *tsd, uint64_t allocated_after, cache_bin_t *bin, void *ret) {
thread_allocated_set(tsd, allocated_after);
if (config_stats) {
bin->tstats.nrequests++;
}
LOG("core.malloc.exit", "result: %p", ret);
}
JEMALLOC_ALWAYS_INLINE bool
@ -256,7 +304,6 @@ malloc_initialized(void) {
*/
JEMALLOC_ALWAYS_INLINE void *
imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
LOG("core.malloc.entry", "size: %zu", size);
if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
return fallback_alloc(size);
}
@ -284,8 +331,8 @@ imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
sz_size2index_usize_fastpath(size, &ind, &usize);
/* Fast path relies on size being a bin. */
assert(ind < SC_NBINS);
assert((SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS) &&
(size <= SC_SMALL_MAXCLASS));
assert((SC_LOOKUP_MAXCLASS < SC_SMALL_MAXCLASS)
&& (size <= SC_SMALL_MAXCLASS));
uint64_t allocated, threshold;
te_malloc_fastpath_ctx(tsd, &allocated, &threshold);
@ -314,7 +361,9 @@ imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
tcache_t *tcache = tsd_tcachep_get(tsd);
assert(tcache == tcache_get(tsd));
cache_bin_t *bin = &tcache->bins[ind];
bool tcache_success;
/* Suppress spurious warning from static analysis */
assert(bin != NULL);
bool tcache_success;
void *ret;
/*
@ -337,4 +386,215 @@ imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
return fallback_alloc(size);
}
JEMALLOC_ALWAYS_INLINE tcache_t *
tcache_get_from_ind(tsd_t *tsd, unsigned tcache_ind, bool slow, bool is_alloc) {
tcache_t *tcache;
if (tcache_ind == TCACHE_IND_AUTOMATIC) {
if (likely(!slow)) {
/* Getting tcache ptr unconditionally. */
tcache = tsd_tcachep_get(tsd);
assert(tcache == tcache_get(tsd));
} else if (is_alloc
|| likely(tsd_reentrancy_level_get(tsd) == 0)) {
tcache = tcache_get(tsd);
} else {
tcache = NULL;
}
} else {
/*
* Should not specify tcache on deallocation path when being
* reentrant.
*/
assert(is_alloc || tsd_reentrancy_level_get(tsd) == 0
|| tsd_state_nocleanup(tsd));
if (tcache_ind == TCACHE_IND_NONE) {
tcache = NULL;
} else {
tcache = tcaches_get(tsd, tcache_ind);
}
}
return tcache;
}
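The dispatch above relies on sentinel values encoded as large unsigned indices. A sketch of just that dispatch, with illustrative names (the real function also distinguishes slow paths and reentrancy, which is elided here):

```c
#include <string.h>

#define IND_NONE      ((unsigned)-1)
#define IND_AUTOMATIC ((unsigned)-2)

/*
 * Sentinel indices select "no cache" or the thread's own cache; any other
 * value is an explicitly managed cache slot.
 */
static const char *
cache_for_ind(unsigned ind) {
	if (ind == IND_AUTOMATIC) {
		return "thread-local";
	}
	if (ind == IND_NONE) {
		return "none";
	}
	return "explicit";
}
```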
JEMALLOC_ALWAYS_INLINE bool
maybe_check_alloc_ctx(tsd_t *tsd, void *ptr, emap_alloc_ctx_t *alloc_ctx) {
if (config_opt_size_checks) {
emap_alloc_ctx_t dbg_ctx;
emap_alloc_ctx_lookup(
tsd_tsdn(tsd), &arena_emap_global, ptr, &dbg_ctx);
if (alloc_ctx->szind != dbg_ctx.szind) {
safety_check_fail_sized_dealloc(
/* current_dealloc */ true, ptr,
/* true_size */ emap_alloc_ctx_usize_get(&dbg_ctx),
/* input_size */
emap_alloc_ctx_usize_get(alloc_ctx));
return true;
}
if (alloc_ctx->slab != dbg_ctx.slab) {
safety_check_fail(
"Internal heap corruption detected: "
"mismatch in slab bit");
return true;
}
}
return false;
}
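The size-check above validates that a caller-supplied size maps to the same size class the allocator recorded at allocation time. A toy version, assuming hypothetical power-of-two size classes in place of jemalloc's real `sz` tables:

```c
#include <assert.h>
#include <stddef.h>

/* Toy size-to-index mapping (hypothetical power-of-two classes). */
static unsigned
sz2index(size_t sz) {
	unsigned ind = 0;
	size_t cls = 1;
	while (cls < sz) {
		cls <<= 1;
		ind++;
	}
	return ind;
}

/* A sized free is rejected when the passed size lands in a different class. */
static int
sized_dalloc_mismatch(unsigned recorded_ind, size_t passed_size) {
	return sz2index(passed_size) != recorded_ind;
}
```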
JEMALLOC_ALWAYS_INLINE bool
prof_sample_aligned(const void *ptr) {
return ((uintptr_t)ptr & PROF_SAMPLE_ALIGNMENT_MASK) == 0;
}
JEMALLOC_ALWAYS_INLINE bool
free_fastpath_nonfast_aligned(void *ptr, bool check_prof) {
/*
* free_fastpath does not handle two uncommon cases: 1) sampled profiled
* objects and 2) sampled junk & stash for use-after-free detection.
* Both have special alignments which are used to escape the fastpath.
*
* prof_sample is page-aligned, which covers the UAF check when both
* are enabled (the assertion below). Avoiding redundant checks since
* this is on the fastpath -- at most one runtime branch from this.
*/
if (config_debug && cache_bin_nonfast_aligned(ptr)) {
assert(prof_sample_aligned(ptr));
}
if (config_prof && check_prof) {
/* When prof is enabled, the prof_sample alignment is enough. */
if (prof_sample_aligned(ptr)) {
return true;
} else {
return false;
}
}
if (config_uaf_detection) {
if (cache_bin_nonfast_aligned(ptr)) {
return true;
} else {
return false;
}
}
return false;
}
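The trick above gives specially treated objects (profiling samples, UAF-guarded stashes) a distinctive alignment, so a single AND-and-compare on the address diverts them off the fast path. A sketch; the 4 KiB mask is an illustrative stand-in for `PROF_SAMPLE_ALIGNMENT_MASK`:

```c
#include <stdint.h>

/* Fully aligned addresses (mask bits all zero) take the slow path. */
static int
escapes_fastpath(uintptr_t addr, uintptr_t mask) {
	return (addr & mask) == 0;
}
```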
/* Returns whether or not the free attempt was successful. */
JEMALLOC_ALWAYS_INLINE
bool
free_fastpath(void *ptr, size_t size, bool size_hint) {
tsd_t *tsd = tsd_get(false);
/* The branch gets optimized away unless tsd_get_allocates(). */
if (unlikely(tsd == NULL)) {
return false;
}
/*
* The tsd_fast() / initialized checks are folded into the branch
* testing (deallocated_after >= threshold) later in this function.
* The threshold will be set to 0 when !tsd_fast.
*/
assert(tsd_fast(tsd)
|| *tsd_thread_deallocated_next_event_fastp_get_unsafe(tsd) == 0);
emap_alloc_ctx_t alloc_ctx JEMALLOC_CC_SILENCE_INIT({0, 0, false});
size_t usize;
if (!size_hint) {
bool err = emap_alloc_ctx_try_lookup_fast(
tsd, &arena_emap_global, ptr, &alloc_ctx);
/* Note: profiled objects will have alloc_ctx.slab set */
if (unlikely(err || !alloc_ctx.slab
|| free_fastpath_nonfast_aligned(ptr,
/* check_prof */ false))) {
return false;
}
assert(alloc_ctx.szind != SC_NSIZES);
usize = sz_index2size(alloc_ctx.szind);
} else {
/*
* Check for both sizes that are too large, and for sampled /
* special aligned objects. The alignment check will also check
* for null ptr.
*/
if (unlikely(size > SC_LOOKUP_MAXCLASS
|| free_fastpath_nonfast_aligned(ptr,
/* check_prof */ true))) {
return false;
}
sz_size2index_usize_fastpath(size, &alloc_ctx.szind, &usize);
/* Max lookup class must be small. */
assert(alloc_ctx.szind < SC_NBINS);
/* This is a dead store, except when opt size checking is on. */
alloc_ctx.slab = true;
}
/*
* Currently the fastpath only handles small sizes. The branch on
* SC_LOOKUP_MAXCLASS makes sure of it. This lets us avoid checking
* tcache szind upper limit (i.e. tcache_max) as well.
*/
assert(alloc_ctx.slab);
uint64_t deallocated, threshold;
te_free_fastpath_ctx(tsd, &deallocated, &threshold);
uint64_t deallocated_after = deallocated + usize;
/*
* Check for events and tsd non-nominal (fast_threshold will be set to
* 0) in a single branch. Note that this handles the uninitialized case
* as well (TSD init will be triggered on the non-fastpath). Therefore
* anything depends on a functional TSD (e.g. the alloc_ctx sanity check
* below) needs to be after this branch.
*/
if (unlikely(deallocated_after >= threshold)) {
return false;
}
assert(tsd_fast(tsd));
bool fail = maybe_check_alloc_ctx(tsd, ptr, &alloc_ctx);
if (fail) {
/* See the comment in isfree. */
return true;
}
tcache_t *tcache = tcache_get_from_ind(tsd, TCACHE_IND_AUTOMATIC,
/* slow */ false, /* is_alloc */ false);
cache_bin_t *bin = &tcache->bins[alloc_ctx.szind];
/*
* If junking were enabled, this is where we would do it. It's not
* though, since we ensured above that we're on the fast path. Assert
* that to double-check.
*/
assert(!opt_junk_free);
if (!cache_bin_dalloc_easy(bin, ptr)) {
return false;
}
*tsd_thread_deallocatedp_get(tsd) = deallocated_after;
return true;
}
JEMALLOC_ALWAYS_INLINE void JEMALLOC_NOTHROW
je_sdallocx_noflags(void *ptr, size_t size) {
if (!free_fastpath(ptr, size, true)) {
sdallocx_default(ptr, size, 0);
}
}
JEMALLOC_ALWAYS_INLINE void JEMALLOC_NOTHROW
je_sdallocx_impl(void *ptr, size_t size, int flags) {
if (flags != 0 || !free_fastpath(ptr, size, true)) {
sdallocx_default(ptr, size, flags);
}
}
JEMALLOC_ALWAYS_INLINE void JEMALLOC_NOTHROW
je_free_impl(void *ptr) {
if (!free_fastpath(ptr, 0, false)) {
free_default(ptr);
}
}
#endif /* JEMALLOC_INTERNAL_INLINES_C_H */
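The comment in `free_fastpath` about folding the tsd-fast and event checks into one branch deserves a sketch: by forcing the threshold to 0 for non-fast or uninitialized threads, a single comparison covers "event due", "tsd not fast", and "uninitialized". Names below are illustrative, not jemalloc's:

```c
#include <assert.h>
#include <stdint.h>

typedef struct {
	uint64_t deallocated;
	uint64_t threshold; /* 0 => always take the slow path */
} fake_tsd_t;

/*
 * One branch covers event processing, tsd init, and non-nominal state,
 * because all three force threshold to 0.
 */
static int
dealloc_fastpath_ok(fake_tsd_t *tsd, uint64_t usize) {
	uint64_t after = tsd->deallocated + usize;
	if (after >= tsd->threshold) {
		return 0; /* slow path */
	}
	tsd->deallocated = after;
	return 1;
}
```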

View file

@ -2,43 +2,46 @@
#define JEMALLOC_INTERNAL_MACROS_H
#ifdef JEMALLOC_DEBUG
# define JEMALLOC_ALWAYS_INLINE static inline
# define JEMALLOC_ALWAYS_INLINE static inline
#else
# ifdef _MSC_VER
# define JEMALLOC_ALWAYS_INLINE static __forceinline
# else
# define JEMALLOC_ALWAYS_INLINE JEMALLOC_ATTR(always_inline) static inline
# endif
# ifdef _MSC_VER
# define JEMALLOC_ALWAYS_INLINE static __forceinline
# else
# define JEMALLOC_ALWAYS_INLINE \
JEMALLOC_ATTR(always_inline) static inline
# endif
#endif
#ifdef _MSC_VER
# define inline _inline
# define inline _inline
#endif
#define UNUSED JEMALLOC_ATTR(unused)
#define ZU(z) ((size_t)z)
#define ZD(z) ((ssize_t)z)
#define QU(q) ((uint64_t)q)
#define QD(q) ((int64_t)q)
#define ZU(z) ((size_t)z)
#define ZD(z) ((ssize_t)z)
#define QU(q) ((uint64_t)q)
#define QD(q) ((int64_t)q)
#define KZU(z) ZU(z##ULL)
#define KZD(z) ZD(z##LL)
#define KQU(q) QU(q##ULL)
#define KQD(q) QI(q##LL)
#define KZU(z) ZU(z##ULL)
#define KZD(z) ZD(z##LL)
#define KQU(q) QU(q##ULL)
#define KQD(q) QI(q##LL)
#ifndef __DECONST
# define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var))
# define __DECONST(type, var) ((type)(uintptr_t)(const void *)(var))
#endif
#if !defined(JEMALLOC_HAS_RESTRICT) || defined(__cplusplus)
# define restrict
# define restrict
#endif
/* Various function pointers are static and immutable except during testing. */
#ifdef JEMALLOC_JET
# define JET_MUTABLE
# define JET_MUTABLE
# define JET_EXTERN extern
#else
# define JET_MUTABLE const
# define JET_MUTABLE const
# define JET_EXTERN static
#endif
#define JEMALLOC_VA_ARGS_HEAD(head, ...) head
@ -46,62 +49,94 @@
/* Diagnostic suppression macros */
#if defined(_MSC_VER) && !defined(__clang__)
# define JEMALLOC_DIAGNOSTIC_PUSH __pragma(warning(push))
# define JEMALLOC_DIAGNOSTIC_POP __pragma(warning(pop))
# define JEMALLOC_DIAGNOSTIC_IGNORE(W) __pragma(warning(disable:W))
# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
# define JEMALLOC_DIAGNOSTIC_PUSH __pragma(warning(push))
# define JEMALLOC_DIAGNOSTIC_POP __pragma(warning(pop))
# define JEMALLOC_DIAGNOSTIC_IGNORE(W) __pragma(warning(disable : W))
# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
# define JEMALLOC_DIAGNOSTIC_IGNORE_FRAME_ADDRESS
# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
# define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED
# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
/* #pragma GCC diagnostic first appeared in gcc 4.6. */
#elif (defined(__GNUC__) && ((__GNUC__ > 4) || ((__GNUC__ == 4) && \
(__GNUC_MINOR__ > 5)))) || defined(__clang__)
#elif (defined(__GNUC__) \
&& ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 5)))) \
|| defined(__clang__)
/*
* The JEMALLOC_PRAGMA__ macro is an implementation detail of the GCC and Clang
* diagnostic suppression macros and should not be used anywhere else.
*/
# define JEMALLOC_PRAGMA__(X) _Pragma(#X)
# define JEMALLOC_DIAGNOSTIC_PUSH JEMALLOC_PRAGMA__(GCC diagnostic push)
# define JEMALLOC_DIAGNOSTIC_POP JEMALLOC_PRAGMA__(GCC diagnostic pop)
# define JEMALLOC_DIAGNOSTIC_IGNORE(W) \
JEMALLOC_PRAGMA__(GCC diagnostic ignored W)
# define JEMALLOC_PRAGMA__(X) _Pragma(#X)
# define JEMALLOC_DIAGNOSTIC_PUSH JEMALLOC_PRAGMA__(GCC diagnostic push)
# define JEMALLOC_DIAGNOSTIC_POP JEMALLOC_PRAGMA__(GCC diagnostic pop)
# define JEMALLOC_DIAGNOSTIC_IGNORE(W) \
JEMALLOC_PRAGMA__(GCC diagnostic ignored W)
/*
* The -Wmissing-field-initializers warning is buggy in GCC versions < 5.1 and
* all clang versions up to version 7 (currently trunk, unreleased). This macro
* suppresses the warning for the affected compiler versions only.
*/
# if ((defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 5)) || \
defined(__clang__)
# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS \
JEMALLOC_DIAGNOSTIC_IGNORE("-Wmissing-field-initializers")
# else
# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
# endif
# if ((defined(__GNUC__) && !defined(__clang__)) && (__GNUC__ < 5)) \
|| defined(__clang__)
# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS \
JEMALLOC_DIAGNOSTIC_IGNORE( \
"-Wmissing-field-initializers")
# else
# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
# endif
# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS \
JEMALLOC_DIAGNOSTIC_IGNORE("-Wtype-limits")
# define JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER \
JEMALLOC_DIAGNOSTIC_IGNORE("-Wunused-parameter")
# if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 7)
# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN \
JEMALLOC_DIAGNOSTIC_IGNORE("-Walloc-size-larger-than=")
# else
# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
# endif
# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS \
JEMALLOC_DIAGNOSTIC_PUSH \
JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER
# define JEMALLOC_DIAGNOSTIC_IGNORE_FRAME_ADDRESS \
JEMALLOC_DIAGNOSTIC_IGNORE("-Wframe-address")
# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS \
JEMALLOC_DIAGNOSTIC_IGNORE("-Wtype-limits")
# define JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER \
JEMALLOC_DIAGNOSTIC_IGNORE("-Wunused-parameter")
# if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ >= 7)
# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN \
JEMALLOC_DIAGNOSTIC_IGNORE("-Walloc-size-larger-than=")
# else
# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
# endif
# ifdef JEMALLOC_HAVE_ATTR_DEPRECATED
# define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED \
JEMALLOC_DIAGNOSTIC_IGNORE("-Wdeprecated-declarations")
# else
# define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED
# endif
# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS \
JEMALLOC_DIAGNOSTIC_PUSH \
JEMALLOC_DIAGNOSTIC_IGNORE_UNUSED_PARAMETER
#else
# define JEMALLOC_DIAGNOSTIC_PUSH
# define JEMALLOC_DIAGNOSTIC_POP
# define JEMALLOC_DIAGNOSTIC_IGNORE(W)
# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
# define JEMALLOC_DIAGNOSTIC_PUSH
# define JEMALLOC_DIAGNOSTIC_POP
# define JEMALLOC_DIAGNOSTIC_IGNORE(W)
# define JEMALLOC_DIAGNOSTIC_IGNORE_MISSING_STRUCT_FIELD_INITIALIZERS
# define JEMALLOC_DIAGNOSTIC_IGNORE_FRAME_ADDRESS
# define JEMALLOC_DIAGNOSTIC_IGNORE_TYPE_LIMITS
# define JEMALLOC_DIAGNOSTIC_IGNORE_ALLOC_SIZE_LARGER_THAN
# define JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED
# define JEMALLOC_DIAGNOSTIC_DISABLE_SPURIOUS
#endif
#ifdef __clang_analyzer__
# define JEMALLOC_CLANG_ANALYZER
#endif
#ifdef JEMALLOC_CLANG_ANALYZER
# define JEMALLOC_CLANG_ANALYZER_SUPPRESS __attribute__((suppress))
# define JEMALLOC_CLANG_ANALYZER_SILENCE_INIT(v) = v
#else
# define JEMALLOC_CLANG_ANALYZER_SUPPRESS
# define JEMALLOC_CLANG_ANALYZER_SILENCE_INIT(v)
#endif
#define JEMALLOC_SUPPRESS_WARN_ON_USAGE(...) \
JEMALLOC_DIAGNOSTIC_PUSH \
JEMALLOC_DIAGNOSTIC_IGNORE_DEPRECATED \
__VA_ARGS__ \
JEMALLOC_DIAGNOSTIC_POP
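On GCC and Clang, the suppression macros above expand to `_Pragma` operators that stringize the diagnostic directive. A self-contained sketch of that expansion (macro names here are illustrative; MSVC uses `__pragma(warning(...))` instead):

```c
#define PRAGMA_(x)     _Pragma(#x)
#define DIAG_PUSH      PRAGMA_(GCC diagnostic push)
#define DIAG_POP       PRAGMA_(GCC diagnostic pop)
#define DIAG_IGNORE(w) PRAGMA_(GCC diagnostic ignored w)

DIAG_PUSH
DIAG_IGNORE("-Wunused-parameter")
/* unused_arg triggers no -Wunused-parameter warning inside the push/pop. */
static int
mostly_unused(int used, int unused_arg) {
	return used * 2;
}
DIAG_POP
```

Note how stringizing `GCC diagnostic ignored w` with `w` already a string literal yields the escaped inner quotes that `#pragma GCC diagnostic ignored "-W..."` requires.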
/*
* Disables spurious diagnostics for all headers. Since these headers are not
* included by users directly, it does not affect their diagnostic settings.

View file

@ -0,0 +1,22 @@
#ifndef JEMALLOC_INTERNAL_OVERRIDES_H
#define JEMALLOC_INTERNAL_OVERRIDES_H
/*
* Under normal circumstances this header serves no purpose, as these settings
* can be customized via the corresponding autoconf options at configure-time.
* Overriding in this fashion is useful when the header files generated by
* autoconf are used as input for another build system.
*/
#ifdef JEMALLOC_OVERRIDE_LG_PAGE
# undef LG_PAGE
# define LG_PAGE JEMALLOC_OVERRIDE_LG_PAGE
#endif
#ifdef JEMALLOC_OVERRIDE_JEMALLOC_CONFIG_MALLOC_CONF
# undef JEMALLOC_CONFIG_MALLOC_CONF
# define JEMALLOC_CONFIG_MALLOC_CONF \
JEMALLOC_OVERRIDE_JEMALLOC_CONFIG_MALLOC_CONF
#endif
#endif /* JEMALLOC_INTERNAL_OVERRIDES_H */
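The `#undef`/`#define` override pattern in this new header can be sketched in miniature. `MYLIB_OVERRIDE_LG_PAGE` below is a hypothetical knob standing in for `JEMALLOC_OVERRIDE_LG_PAGE`; when the embedding build system does not predefine it, the autoconf-style default survives:

```c
#define LG_PAGE 12 /* value autoconf would have generated */

#ifdef MYLIB_OVERRIDE_LG_PAGE
#  undef LG_PAGE
#  define LG_PAGE MYLIB_OVERRIDE_LG_PAGE
#endif

static const int page_size = 1 << LG_PAGE;
```

Passing `-DMYLIB_OVERRIDE_LG_PAGE=14` on the compiler command line would swap in a 16 KiB page without regenerating the configured headers.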

View file

@ -18,13 +18,13 @@ enum zero_realloc_action_e {
typedef enum zero_realloc_action_e zero_realloc_action_t;
/* Signature of write callback. */
typedef void (write_cb_t)(void *, const char *);
typedef void(write_cb_t)(void *, const char *);
enum malloc_init_e {
malloc_init_uninitialized = 3,
malloc_init_a0_initialized = 2,
malloc_init_recursible = 1,
malloc_init_initialized = 0 /* Common case --> jnz. */
malloc_init_uninitialized = 3,
malloc_init_a0_initialized = 2,
malloc_init_recursible = 1,
malloc_init_initialized = 0 /* Common case --> jnz. */
};
typedef enum malloc_init_e malloc_init_t;
@ -39,48 +39,46 @@ typedef enum malloc_init_e malloc_init_t;
*
* aaaaaaaa aaaatttt tttttttt 0znnnnnn
*/
#define MALLOCX_ARENA_BITS 12
#define MALLOCX_TCACHE_BITS 12
#define MALLOCX_LG_ALIGN_BITS 6
#define MALLOCX_ARENA_SHIFT 20
#define MALLOCX_TCACHE_SHIFT 8
#define MALLOCX_ARENA_MASK \
(((1 << MALLOCX_ARENA_BITS) - 1) << MALLOCX_ARENA_SHIFT)
#define MALLOCX_ARENA_BITS 12
#define MALLOCX_TCACHE_BITS 12
#define MALLOCX_LG_ALIGN_BITS 6
#define MALLOCX_ARENA_SHIFT 20
#define MALLOCX_TCACHE_SHIFT 8
#define MALLOCX_ARENA_MASK \
((unsigned)(((1U << MALLOCX_ARENA_BITS) - 1) << MALLOCX_ARENA_SHIFT))
/* NB: Arena index bias decreases the maximum number of arenas by 1. */
#define MALLOCX_ARENA_LIMIT ((1 << MALLOCX_ARENA_BITS) - 1)
#define MALLOCX_TCACHE_MASK \
(((1 << MALLOCX_TCACHE_BITS) - 1) << MALLOCX_TCACHE_SHIFT)
#define MALLOCX_TCACHE_MAX ((1 << MALLOCX_TCACHE_BITS) - 3)
#define MALLOCX_LG_ALIGN_MASK ((1 << MALLOCX_LG_ALIGN_BITS) - 1)
#define MALLOCX_ARENA_LIMIT ((unsigned)((1U << MALLOCX_ARENA_BITS) - 1))
#define MALLOCX_TCACHE_MASK \
((unsigned)(((1U << MALLOCX_TCACHE_BITS) - 1) << MALLOCX_TCACHE_SHIFT))
#define MALLOCX_TCACHE_MAX ((unsigned)((1U << MALLOCX_TCACHE_BITS) - 3))
#define MALLOCX_LG_ALIGN_MASK ((1 << MALLOCX_LG_ALIGN_BITS) - 1)
/* Use MALLOCX_ALIGN_GET() if alignment may not be specified in flags. */
#define MALLOCX_ALIGN_GET_SPECIFIED(flags) \
(ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK))
#define MALLOCX_ALIGN_GET(flags) \
(MALLOCX_ALIGN_GET_SPECIFIED(flags) & (SIZE_T_MAX-1))
#define MALLOCX_ZERO_GET(flags) \
((bool)(flags & MALLOCX_ZERO))
#define MALLOCX_ALIGN_GET_SPECIFIED(flags) \
(ZU(1) << (flags & MALLOCX_LG_ALIGN_MASK))
#define MALLOCX_ALIGN_GET(flags) \
(MALLOCX_ALIGN_GET_SPECIFIED(flags) & (SIZE_T_MAX - 1))
#define MALLOCX_ZERO_GET(flags) ((bool)(flags & MALLOCX_ZERO))
#define MALLOCX_TCACHE_GET(flags) \
(((unsigned)((flags & MALLOCX_TCACHE_MASK) >> MALLOCX_TCACHE_SHIFT)) - 2)
#define MALLOCX_ARENA_GET(flags) \
(((unsigned)(((unsigned)flags) >> MALLOCX_ARENA_SHIFT)) - 1)
#define MALLOCX_TCACHE_GET(flags) \
(((unsigned)((flags & MALLOCX_TCACHE_MASK) >> MALLOCX_TCACHE_SHIFT)) \
- 2)
#define MALLOCX_ARENA_GET(flags) \
(((unsigned)(((unsigned)flags) >> MALLOCX_ARENA_SHIFT)) - 1)
/* Smallest size class to support. */
#define TINY_MIN (1U << LG_TINY_MIN)
#define TINY_MIN (1U << LG_TINY_MIN)
#define LONG ((size_t)(1U << LG_SIZEOF_LONG))
#define LONG_MASK (LONG - 1)
#define LONG ((size_t)(1U << LG_SIZEOF_LONG))
#define LONG_MASK (LONG - 1)
/* Return the smallest long multiple that is >= a. */
#define LONG_CEILING(a) \
(((a) + LONG_MASK) & ~LONG_MASK)
#define LONG_CEILING(a) (((a) + LONG_MASK) & ~LONG_MASK)
#define SIZEOF_PTR (1U << LG_SIZEOF_PTR)
#define PTR_MASK (SIZEOF_PTR - 1)
#define SIZEOF_PTR (1U << LG_SIZEOF_PTR)
#define PTR_MASK (SIZEOF_PTR - 1)
/* Return the smallest (void *) multiple that is >= a. */
#define PTR_CEILING(a) \
(((a) + PTR_MASK) & ~PTR_MASK)
#define PTR_CEILING(a) (((a) + PTR_MASK) & ~PTR_MASK)
/*
* Maximum size of L1 cache line. This is used to avoid cache line aliasing.
@@ -89,42 +87,62 @@ typedef enum malloc_init_e malloc_init_t;
* CACHELINE cannot be based on LG_CACHELINE because __declspec(align()) can
* only handle raw constants.
*/
#define LG_CACHELINE 6
#define CACHELINE 64
#define CACHELINE_MASK (CACHELINE - 1)
#define LG_CACHELINE 6
#define CACHELINE 64
#define CACHELINE_MASK (CACHELINE - 1)
/* Return the smallest cacheline multiple that is >= s. */
#define CACHELINE_CEILING(s) \
(((s) + CACHELINE_MASK) & ~CACHELINE_MASK)
#define CACHELINE_CEILING(s) (((s) + CACHELINE_MASK) & ~CACHELINE_MASK)
/* Return the nearest aligned address at or below a. */
#define ALIGNMENT_ADDR2BASE(a, alignment) \
((void *)((uintptr_t)(a) & ((~(alignment)) + 1)))
#define ALIGNMENT_ADDR2BASE(a, alignment) \
((void *)(((byte_t *)(a)) \
- (((uintptr_t)(a)) - ((uintptr_t)(a) & ((~(alignment)) + 1)))))
/* Return the offset between a and the nearest aligned address at or below a. */
#define ALIGNMENT_ADDR2OFFSET(a, alignment) \
#define ALIGNMENT_ADDR2OFFSET(a, alignment) \
((size_t)((uintptr_t)(a) & (alignment - 1)))
/* Return the smallest alignment multiple that is >= s. */
#define ALIGNMENT_CEILING(s, alignment) \
#define ALIGNMENT_CEILING(s, alignment) \
(((s) + (alignment - 1)) & ((~(alignment)) + 1))
/*
* Return the nearest aligned address at or above a.
*
* While at first glance this would appear to be merely a more complicated
* way to perform the same computation as `ALIGNMENT_CEILING`,
* this has the important additional property of not concealing pointer
* provenance from the compiler. See the block-comment on the
* definition of `byte_t` for more details.
*/
#define ALIGNMENT_ADDR2CEILING(a, alignment) \
((void *)(((byte_t *)(a)) \
+ (((((uintptr_t)(a)) + (alignment - 1)) & ((~(alignment)) + 1)) \
- ((uintptr_t)(a)))))
/* Declare a variable-length array. */
#if __STDC_VERSION__ < 199901L
# ifdef _MSC_VER
# include <malloc.h>
# define alloca _alloca
# else
# ifdef JEMALLOC_HAS_ALLOCA_H
# include <alloca.h>
# else
# include <stdlib.h>
# endif
# endif
# define VARIABLE_ARRAY(type, name, count) \
type *name = alloca(sizeof(type) * (count))
#if __STDC_VERSION__ < 199901L || defined(__STDC_NO_VLA__)
# ifdef _MSC_VER
# include <malloc.h>
# define alloca _alloca
# else
# ifdef JEMALLOC_HAS_ALLOCA_H
# include <alloca.h>
# else
# include <stdlib.h>
# endif
# endif
# define VARIABLE_ARRAY_UNSAFE(type, name, count) \
type *name = alloca(sizeof(type) * (count))
#else
# define VARIABLE_ARRAY(type, name, count) type name[(count)]
# define VARIABLE_ARRAY_UNSAFE(type, name, count) type name[(count)]
#endif
#define VARIABLE_ARRAY_SIZE_MAX 2048
#define VARIABLE_ARRAY(type, name, count) \
assert(sizeof(type) * (count) <= VARIABLE_ARRAY_SIZE_MAX); \
VARIABLE_ARRAY_UNSAFE(type, name, count)
#define CALLOC_MADVISE_THRESHOLD_DEFAULT (((size_t)1) << 23) /* 8 MB */
#endif /* JEMALLOC_INTERNAL_TYPES_H */


@@ -1,7 +1,7 @@
#ifndef JEMALLOC_PREAMBLE_H
#define JEMALLOC_PREAMBLE_H
#include "jemalloc_internal_defs.h"
#include "jemalloc/internal/jemalloc_internal_defs.h"
#include "jemalloc/internal/jemalloc_internal_decls.h"
#if defined(JEMALLOC_UTRACE) || defined(JEMALLOC_UTRACE_LABEL)
@@ -57,6 +57,15 @@
# define JEMALLOC_MADV_FREE 8
#endif
/*
* Can be defined at compile time for cases where the
* madvise(..., MADV_COLLAPSE) feature is known to be supported but the
* MADV_COLLAPSE constant is not defined.
*/
#ifdef JEMALLOC_DEFINE_MADVISE_COLLAPSE
# define JEMALLOC_MADV_COLLAPSE 25
#endif
static const bool config_debug =
#ifdef JEMALLOC_DEBUG
true
@@ -78,6 +87,13 @@ static const bool have_madvise_huge =
false
#endif
;
static const bool have_process_madvise =
#ifdef JEMALLOC_HAVE_PROCESS_MADVISE
true
#else
false
#endif
;
static const bool config_fill =
#ifdef JEMALLOC_FILL
true
@@ -114,6 +130,13 @@ static const bool config_prof_libunwind =
false
#endif
;
static const bool config_prof_frameptr =
#ifdef JEMALLOC_PROF_FRAME_POINTER
true
#else
false
#endif
;
static const bool maps_coalesce =
#ifdef JEMALLOC_MAPS_COALESCE
true
@@ -215,7 +238,7 @@ static const bool config_enable_cxx =
#endif
;
#if defined(_WIN32) || defined(JEMALLOC_HAVE_SCHED_GETCPU)
#if defined(_WIN32) || defined(__APPLE__) || defined(JEMALLOC_HAVE_SCHED_GETCPU)
/* Currently percpu_arena depends on sched_getcpu. */
#define JEMALLOC_PERCPU_ARENA
#endif


@@ -0,0 +1,49 @@
#ifndef JEMALLOC_INTERNAL_JEMALLOC_PROBE_H
#define JEMALLOC_INTERNAL_JEMALLOC_PROBE_H
#include <jemalloc/internal/jemalloc_preamble.h>
#ifdef JEMALLOC_EXPERIMENTAL_USDT_STAP
#include <jemalloc/internal/jemalloc_probe_stap.h>
#elif defined(JEMALLOC_EXPERIMENTAL_USDT_CUSTOM)
#include <jemalloc/internal/jemalloc_probe_custom.h>
#elif defined(_MSC_VER)
#define JE_USDT(name, N, ...) /* Nothing */
#else /* no USDT, just check the args */
#define JE_USDT(name, N, ...) _JE_USDT_CHECK_ARG##N(__VA_ARGS__)
#define _JE_USDT_CHECK_ARG1(a) \
do { \
(void)(a); \
} while (0)
#define _JE_USDT_CHECK_ARG2(a, b) \
do { \
(void)(a); \
(void)(b); \
} while (0)
#define _JE_USDT_CHECK_ARG3(a, b, c) \
do { \
(void)(a); \
(void)(b); \
(void)(c); \
} while (0)
#define _JE_USDT_CHECK_ARG4(a, b, c, d) \
do { \
(void)(a); \
(void)(b); \
(void)(c); \
(void)(d); \
} while (0)
#define _JE_USDT_CHECK_ARG5(a, b, c, d, e) \
do { \
(void)(a); \
(void)(b); \
(void)(c); \
(void)(d); \
(void)(e); \
} while (0)
#endif /* JEMALLOC_EXPERIMENTAL_USDT_* */
#endif /* JEMALLOC_INTERNAL_JEMALLOC_PROBE_H */


@@ -0,0 +1,148 @@
#ifndef JEMALLOC_INTERNAL_JEMALLOC_PROBE_CUSTOM_H
#define JEMALLOC_INTERNAL_JEMALLOC_PROBE_CUSTOM_H
/* clang-format off */
/*
* This section is based on sys/sdt.h and
* https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
*/
/* Emit NOP for the probe. */
#if (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__) || \
defined(__arm__)) && defined(__linux__)
#define JE_SDT_NOP nop
#else
#error "Architecture not supported"
#endif
/* Assembly macros */
#define JE_SDT_S(x) #x
#define JE_SDT_ASM_1(x) JE_SDT_S(x) "\n"
#define JE_SDT_ASM_2(x, y) \
JE_SDT_S(x) "," JE_SDT_S(y) "\n"
#define JE_SDT_ASM_3(x, y, z) \
JE_SDT_S(x) "," JE_SDT_S(y) "," JE_SDT_S(z) "\n"
#define JE_SDT_ASM_4(x, y, z, p) \
JE_SDT_S(x) "," JE_SDT_S(y) "," JE_SDT_S(z) "," JE_SDT_S(p) "\n"
#define JE_SDT_ASM_5(x, y, z, p, q) \
JE_SDT_S(x) "," JE_SDT_S(y) "," JE_SDT_S(z) "," JE_SDT_S(p) "," \
JE_SDT_S(q) "\n"
/* Arg size */
#ifdef __LP64__
#define JE_SDT_ASM_ADDR .8byte
#else
#define JE_SDT_ASM_ADDR .4byte
#endif
#define JE_SDT_NOTE_NAME "stapsdt"
#define JE_SDT_NOTE_TYPE 3
#define JE_SDT_SEMAPHORE_NONE(provider, name) \
JE_SDT_ASM_1(JE_SDT_ASM_ADDR 0) /* No Semaphore support */
#define JE_SDT_SEMAPHORE_OPERAND(provider, name) \
[__sdt_semaphore] "ip" (0) /* No Semaphore */
#define JE_SDT_ASM_STRING(x) JE_SDT_ASM_1(.asciz JE_SDT_S(x))
#define JE_SDT_NOTE(provider, name, arg_template) \
JE_SDT_ASM_1(990: JE_SDT_NOP) \
JE_SDT_ASM_3( .pushsection .note.stapsdt,"?","note") \
JE_SDT_ASM_1( .balign 4) \
JE_SDT_ASM_3( .4byte 992f-991f, 994f-993f, JE_SDT_NOTE_TYPE) \
JE_SDT_ASM_1(991: .asciz JE_SDT_NOTE_NAME) \
JE_SDT_ASM_1(992: .balign 4) \
JE_SDT_ASM_1(993: JE_SDT_ASM_ADDR 990b) \
JE_SDT_ASM_1( JE_SDT_ASM_ADDR _.stapsdt.base) \
JE_SDT_SEMAPHORE_NONE(provider, name) \
JE_SDT_ASM_STRING(provider) \
JE_SDT_ASM_STRING(name) \
JE_SDT_ASM_STRING(arg_template) \
JE_SDT_ASM_1(994: .balign 4) \
JE_SDT_ASM_1( .popsection)
#define JE_SDT_BASE \
JE_SDT_ASM_1( .ifndef _.stapsdt.base) \
JE_SDT_ASM_5( .pushsection .stapsdt.base, "aG", "progbits", \
.stapsdt.base,comdat) \
JE_SDT_ASM_1( .weak _.stapsdt.base) \
JE_SDT_ASM_1( .hidden _.stapsdt.base) \
JE_SDT_ASM_1( _.stapsdt.base: .space 1) \
JE_SDT_ASM_2( .size _.stapsdt.base, 1) \
JE_SDT_ASM_1( .popsection) \
JE_SDT_ASM_1( .endif)
/*
* Default constraint for probes arguments.
* See https://gcc.gnu.org/onlinedocs/gcc/Constraints.html
*/
#ifndef JE_SDT_ARG_CONSTRAINT
#define JE_SDT_ARG_CONSTRAINT "nor"
#endif
#define JE_SDT_ARGARRAY(x) ((__builtin_classify_type(x) == 14) || \
(__builtin_classify_type(x) == 5))
#define JE_SDT_ARGSIZE(x) (JE_SDT_ARGARRAY(x) ? sizeof(void*) : sizeof(x))
/*
* Format of each probe argument as operand. Size tagged with JE_SDT_Sn,
* with "n" constraint. Value is tagged with JE_SDT_An with configured
* constraint.
*/
#define JE_SDT_ARG(n, x) \
[JE_SDT_S##n] "n" ((size_t)JE_SDT_ARGSIZE(x)), \
[JE_SDT_A##n] JE_SDT_ARG_CONSTRAINT(x)
/* Templates to append arguments as operands. */
#define JE_SDT_OPERANDS_0() [__sdt_dummy] "g" (0)
#define JE_SDT_OPERANDS_1(_1) JE_SDT_ARG(1, _1)
#define JE_SDT_OPERANDS_2(_1, _2) JE_SDT_OPERANDS_1(_1), JE_SDT_ARG(2, _2)
#define JE_SDT_OPERANDS_3(_1, _2, _3) JE_SDT_OPERANDS_2(_1, _2), JE_SDT_ARG(3, _3)
#define JE_SDT_OPERANDS_4(_1, _2, _3, _4) \
JE_SDT_OPERANDS_3(_1, _2, _3), JE_SDT_ARG(4, _4)
#define JE_SDT_OPERANDS_5(_1, _2, _3, _4, _5) \
JE_SDT_OPERANDS_4(_1, _2, _3, _4), JE_SDT_ARG(5, _5)
#define JE_SDT_OPERANDS_6(_1, _2, _3, _4, _5, _6) \
JE_SDT_OPERANDS_5(_1, _2, _3, _4, _5), JE_SDT_ARG(6, _6)
#define JE_SDT_OPERANDS_7(_1, _2, _3, _4, _5, _6, _7) \
JE_SDT_OPERANDS_6(_1, _2, _3, _4, _5, _6), JE_SDT_ARG(7, _7)
/* Templates to reference the arguments from operands. */
#define JE_SDT_ARGFMT(num) %n[JE_SDT_S##num]@%[JE_SDT_A##num]
#define JE_SDT_ARG_TEMPLATE_0 /* No args */
#define JE_SDT_ARG_TEMPLATE_1 JE_SDT_ARGFMT(1)
#define JE_SDT_ARG_TEMPLATE_2 JE_SDT_ARG_TEMPLATE_1 JE_SDT_ARGFMT(2)
#define JE_SDT_ARG_TEMPLATE_3 JE_SDT_ARG_TEMPLATE_2 JE_SDT_ARGFMT(3)
#define JE_SDT_ARG_TEMPLATE_4 JE_SDT_ARG_TEMPLATE_3 JE_SDT_ARGFMT(4)
#define JE_SDT_ARG_TEMPLATE_5 JE_SDT_ARG_TEMPLATE_4 JE_SDT_ARGFMT(5)
#define JE_SDT_ARG_TEMPLATE_6 JE_SDT_ARG_TEMPLATE_5 JE_SDT_ARGFMT(6)
#define JE_SDT_ARG_TEMPLATE_7 JE_SDT_ARG_TEMPLATE_6 JE_SDT_ARGFMT(7)
#define JE_SDT_PROBE( \
provider, name, n, arglist) \
do { \
__asm__ __volatile__( \
JE_SDT_NOTE(provider, name, \
JE_SDT_ARG_TEMPLATE_##n) \
:: JE_SDT_SEMAPHORE_OPERAND(provider, name), \
JE_SDT_OPERANDS_##n arglist); \
__asm__ __volatile__(JE_SDT_BASE); \
} while (0)
#define JE_USDT(name, N, ...) \
JE_SDT_PROBE(jemalloc, name, N, (__VA_ARGS__))
#endif /* JEMALLOC_INTERNAL_JEMALLOC_PROBE_CUSTOM_H */
/* clang-format on */


@@ -0,0 +1,11 @@
#ifndef JEMALLOC_INTERNAL_JEMALLOC_PROBE_STAP_H
#define JEMALLOC_INTERNAL_JEMALLOC_PROBE_STAP_H
#include <sys/sdt.h>
#define JE_USDT(name, N, ...) JE_USDT_PROBE_N(name, N, ##__VA_ARGS__)
#define JE_USDT_PROBE_N(name, N, ...) \
STAP_PROBE##N(jemalloc, name, ##__VA_ARGS__)
#endif /* JEMALLOC_INTERNAL_JEMALLOC_PROBE_STAP_H */


@@ -1,23 +1,24 @@
#ifndef JEMALLOC_INTERNAL_LARGE_EXTERNS_H
#define JEMALLOC_INTERNAL_LARGE_EXTERNS_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/edata.h"
#include "jemalloc/internal/hook.h"
void *large_malloc(tsdn_t *tsdn, arena_t *arena, size_t usize, bool zero);
void *large_palloc(tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment,
bool zero);
bool large_ralloc_no_move(tsdn_t *tsdn, edata_t *edata, size_t usize_min,
size_t usize_max, bool zero);
void *large_palloc(
tsdn_t *tsdn, arena_t *arena, size_t usize, size_t alignment, bool zero);
bool large_ralloc_no_move(tsdn_t *tsdn, edata_t *edata, size_t usize_min,
size_t usize_max, bool zero);
void *large_ralloc(tsdn_t *tsdn, arena_t *arena, void *ptr, size_t usize,
size_t alignment, bool zero, tcache_t *tcache,
hook_ralloc_args_t *hook_args);
void large_dalloc_prep_locked(tsdn_t *tsdn, edata_t *edata);
void large_dalloc_finish(tsdn_t *tsdn, edata_t *edata);
void large_dalloc(tsdn_t *tsdn, edata_t *edata);
size_t large_salloc(tsdn_t *tsdn, const edata_t *edata);
void large_prof_info_get(tsd_t *tsd, edata_t *edata, prof_info_t *prof_info,
bool reset_recent);
void large_dalloc_prep_locked(tsdn_t *tsdn, edata_t *edata);
void large_dalloc_finish(tsdn_t *tsdn, edata_t *edata);
void large_dalloc(tsdn_t *tsdn, edata_t *edata);
void large_prof_info_get(
tsd_t *tsd, edata_t *edata, prof_info_t *prof_info, bool reset_recent);
void large_prof_tctx_reset(edata_t *edata);
void large_prof_info_set(edata_t *edata, prof_tctx_t *tctx, size_t size);


@@ -1,6 +1,11 @@
#ifndef JEMALLOC_INTERNAL_LOCKEDINT_H
#define JEMALLOC_INTERNAL_LOCKEDINT_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/mutex.h"
#include "jemalloc/internal/tsd_types.h"
/*
* In those architectures that support 64-bit atomics, we use atomic updates for
* our 64-bit values. Otherwise, we use a plain uint64_t and synchronize
@@ -25,33 +30,34 @@ struct locked_zu_s {
};
#ifndef JEMALLOC_ATOMIC_U64
# define LOCKEDINT_MTX_DECLARE(name) malloc_mutex_t name;
# define LOCKEDINT_MTX_INIT(mu, name, rank, rank_mode) \
malloc_mutex_init(&(mu), name, rank, rank_mode)
# define LOCKEDINT_MTX(mtx) (&(mtx))
# define LOCKEDINT_MTX_LOCK(tsdn, mu) malloc_mutex_lock(tsdn, &(mu))
# define LOCKEDINT_MTX_UNLOCK(tsdn, mu) malloc_mutex_unlock(tsdn, &(mu))
# define LOCKEDINT_MTX_PREFORK(tsdn, mu) malloc_mutex_prefork(tsdn, &(mu))
# define LOCKEDINT_MTX_POSTFORK_PARENT(tsdn, mu) \
malloc_mutex_postfork_parent(tsdn, &(mu))
# define LOCKEDINT_MTX_POSTFORK_CHILD(tsdn, mu) \
malloc_mutex_postfork_child(tsdn, &(mu))
# define LOCKEDINT_MTX_DECLARE(name) malloc_mutex_t name;
# define LOCKEDINT_MTX_INIT(mu, name, rank, rank_mode) \
malloc_mutex_init(&(mu), name, rank, rank_mode)
# define LOCKEDINT_MTX(mtx) (&(mtx))
# define LOCKEDINT_MTX_LOCK(tsdn, mu) malloc_mutex_lock(tsdn, &(mu))
# define LOCKEDINT_MTX_UNLOCK(tsdn, mu) malloc_mutex_unlock(tsdn, &(mu))
# define LOCKEDINT_MTX_PREFORK(tsdn, mu) \
malloc_mutex_prefork(tsdn, &(mu))
# define LOCKEDINT_MTX_POSTFORK_PARENT(tsdn, mu) \
malloc_mutex_postfork_parent(tsdn, &(mu))
# define LOCKEDINT_MTX_POSTFORK_CHILD(tsdn, mu) \
malloc_mutex_postfork_child(tsdn, &(mu))
#else
# define LOCKEDINT_MTX_DECLARE(name)
# define LOCKEDINT_MTX(mtx) NULL
# define LOCKEDINT_MTX_INIT(mu, name, rank, rank_mode) false
# define LOCKEDINT_MTX_LOCK(tsdn, mu)
# define LOCKEDINT_MTX_UNLOCK(tsdn, mu)
# define LOCKEDINT_MTX_PREFORK(tsdn, mu)
# define LOCKEDINT_MTX_POSTFORK_PARENT(tsdn, mu)
# define LOCKEDINT_MTX_POSTFORK_CHILD(tsdn, mu)
# define LOCKEDINT_MTX_DECLARE(name)
# define LOCKEDINT_MTX(mtx) NULL
# define LOCKEDINT_MTX_INIT(mu, name, rank, rank_mode) false
# define LOCKEDINT_MTX_LOCK(tsdn, mu)
# define LOCKEDINT_MTX_UNLOCK(tsdn, mu)
# define LOCKEDINT_MTX_PREFORK(tsdn, mu)
# define LOCKEDINT_MTX_POSTFORK_PARENT(tsdn, mu)
# define LOCKEDINT_MTX_POSTFORK_CHILD(tsdn, mu)
#endif
#ifdef JEMALLOC_ATOMIC_U64
# define LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx) assert((mtx) == NULL)
# define LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx) assert((mtx) == NULL)
#else
# define LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx) \
malloc_mutex_assert_owner(tsdn, (mtx))
# define LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx) \
malloc_mutex_assert_owner(tsdn, (mtx))
#endif
static inline uint64_t
@@ -65,8 +71,7 @@ locked_read_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p) {
}
static inline void
locked_inc_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p,
uint64_t x) {
locked_inc_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p, uint64_t x) {
LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
#ifdef JEMALLOC_ATOMIC_U64
atomic_fetch_add_u64(&p->val, x, ATOMIC_RELAXED);
@@ -76,8 +81,7 @@ locked_inc_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p,
}
static inline void
locked_dec_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p,
uint64_t x) {
locked_dec_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p, uint64_t x) {
LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
#ifdef JEMALLOC_ATOMIC_U64
uint64_t r = atomic_fetch_sub_u64(&p->val, x, ATOMIC_RELAXED);
@@ -94,7 +98,7 @@ locked_inc_mod_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p,
const uint64_t x, const uint64_t modulus) {
LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
uint64_t before, after;
bool overflow;
bool overflow;
#ifdef JEMALLOC_ATOMIC_U64
before = atomic_load_u64(&p->val, ATOMIC_RELAXED);
do {
@@ -104,8 +108,8 @@ locked_inc_mod_u64(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_u64_t *p,
if (overflow) {
after %= modulus;
}
} while (!atomic_compare_exchange_weak_u64(&p->val, &before, after,
ATOMIC_RELAXED, ATOMIC_RELAXED));
} while (!atomic_compare_exchange_weak_u64(
&p->val, &before, after, ATOMIC_RELAXED, ATOMIC_RELAXED));
#else
before = p->val;
after = before + x;
@@ -162,8 +166,7 @@ locked_read_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p) {
}
static inline void
locked_inc_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p,
size_t x) {
locked_inc_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p, size_t x) {
LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
#ifdef JEMALLOC_ATOMIC_U64
atomic_fetch_add_zu(&p->val, x, ATOMIC_RELAXED);
@@ -174,8 +177,7 @@ locked_inc_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p,
}
static inline void
locked_dec_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p,
size_t x) {
locked_dec_zu(tsdn_t *tsdn, malloc_mutex_t *mtx, locked_zu_t *p, size_t x) {
LOCKEDINT_MTX_ASSERT_INTERNAL(tsdn, mtx);
#ifdef JEMALLOC_ATOMIC_U64
size_t r = atomic_fetch_sub_zu(&p->val, x, ATOMIC_RELAXED);


@@ -1,14 +1,15 @@
#ifndef JEMALLOC_INTERNAL_LOG_H
#define JEMALLOC_INTERNAL_LOG_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/malloc_io.h"
#include "jemalloc/internal/mutex.h"
#ifdef JEMALLOC_LOG
# define JEMALLOC_LOG_VAR_BUFSIZE 1000
# define JEMALLOC_LOG_VAR_BUFSIZE 1000
#else
# define JEMALLOC_LOG_VAR_BUFSIZE 1
# define JEMALLOC_LOG_VAR_BUFSIZE 1
#endif
#define JEMALLOC_LOG_BUFSIZE 4096
@@ -26,16 +27,16 @@
* log("extent.a", "log msg for extent.a"); // 5
* log("extent.b", "log msg for extent.b"); // 6
*
* And your malloc_conf option is "log=arena.a|extent", then lines 2, 4, 5, and
* And your malloc_conf option is "log:arena.a|extent", then lines 2, 4, 5, and
* 6 will print at runtime. You can enable logging from all log vars by
* writing "log=.".
* writing "log:.".
*
* None of this should be regarded as a stable API for right now. It's intended
* as a debugging interface, to let us keep around some of our printf-debugging
* statements.
*/
extern char log_var_names[JEMALLOC_LOG_VAR_BUFSIZE];
extern char log_var_names[JEMALLOC_LOG_VAR_BUFSIZE];
extern atomic_b_t log_init_done;
typedef struct log_var_s log_var_t;
@@ -44,7 +45,7 @@ struct log_var_s {
* Lowest bit is "inited", second lowest is "enabled". Putting them in
* a single word lets us avoid any fences on weak architectures.
*/
atomic_u_t state;
atomic_u_t state;
const char *name;
};
@@ -52,7 +53,8 @@ struct log_var_s {
#define LOG_INITIALIZED_NOT_ENABLED 1U
#define LOG_ENABLED 2U
#define LOG_VAR_INIT(name_str) {ATOMIC_INIT(LOG_NOT_INITIALIZED), name_str}
#define LOG_VAR_INIT(name_str) \
{ ATOMIC_INIT(LOG_NOT_INITIALIZED), name_str }
/*
* Returns the value we should assume for state (which is not necessarily
@@ -62,21 +64,21 @@ struct log_var_s {
unsigned log_var_update_state(log_var_t *log_var);
/* We factor out the metadata management to allow us to test more easily. */
#define log_do_begin(log_var) \
if (config_log) { \
unsigned log_state = atomic_load_u(&(log_var).state, \
ATOMIC_RELAXED); \
if (unlikely(log_state == LOG_NOT_INITIALIZED)) { \
log_state = log_var_update_state(&(log_var)); \
assert(log_state != LOG_NOT_INITIALIZED); \
} \
if (log_state == LOG_ENABLED) { \
{
/* User code executes here. */
#define log_do_end(log_var) \
} \
} \
}
#define log_do_begin(log_var) \
if (config_log) { \
unsigned log_state = atomic_load_u( \
&(log_var).state, ATOMIC_RELAXED); \
if (unlikely(log_state == LOG_NOT_INITIALIZED)) { \
log_state = log_var_update_state(&(log_var)); \
assert(log_state != LOG_NOT_INITIALIZED); \
} \
if (log_state == LOG_ENABLED) { \
{
/* User code executes here. */
#define log_do_end(log_var) \
} \
} \
}
/*
* MSVC has some preprocessor bugs in its expansion of __VA_ARGS__ during
@@ -87,29 +89,29 @@ if (config_log) { \
*/
static inline void
log_impl_varargs(const char *name, ...) {
char buf[JEMALLOC_LOG_BUFSIZE];
char buf[JEMALLOC_LOG_BUFSIZE];
va_list ap;
va_start(ap, name);
const char *format = va_arg(ap, const char *);
size_t dst_offset = 0;
size_t dst_offset = 0;
dst_offset += malloc_snprintf(buf, JEMALLOC_LOG_BUFSIZE, "%s: ", name);
dst_offset += malloc_vsnprintf(buf + dst_offset,
JEMALLOC_LOG_BUFSIZE - dst_offset, format, ap);
dst_offset += malloc_snprintf(buf + dst_offset,
JEMALLOC_LOG_BUFSIZE - dst_offset, "\n");
dst_offset += malloc_vsnprintf(
buf + dst_offset, JEMALLOC_LOG_BUFSIZE - dst_offset, format, ap);
malloc_snprintf(
buf + dst_offset, JEMALLOC_LOG_BUFSIZE - dst_offset, "\n");
va_end(ap);
malloc_write(buf);
}
/* Call as log("log.var.str", "format_string %d", arg_for_format_string); */
#define LOG(log_var_str, ...) \
do { \
static log_var_t log_var = LOG_VAR_INIT(log_var_str); \
log_do_begin(log_var) \
log_impl_varargs((log_var).name, __VA_ARGS__); \
log_do_end(log_var) \
} while (0)
#define LOG(log_var_str, ...) \
do { \
static log_var_t log_var = LOG_VAR_INIT(log_var_str); \
log_do_begin(log_var) \
log_impl_varargs((log_var).name, __VA_ARGS__); \
log_do_end(log_var) \
} while (0)
#endif /* JEMALLOC_INTERNAL_LOG_H */


@@ -1,105 +1,101 @@
#ifndef JEMALLOC_INTERNAL_MALLOC_IO_H
#define JEMALLOC_INTERNAL_MALLOC_IO_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_types.h"
#ifdef _WIN32
# ifdef _WIN64
# define FMT64_PREFIX "ll"
# define FMTPTR_PREFIX "ll"
# else
# define FMT64_PREFIX "ll"
# define FMTPTR_PREFIX ""
# endif
# define FMTd32 "d"
# define FMTu32 "u"
# define FMTx32 "x"
# define FMTd64 FMT64_PREFIX "d"
# define FMTu64 FMT64_PREFIX "u"
# define FMTx64 FMT64_PREFIX "x"
# define FMTdPTR FMTPTR_PREFIX "d"
# define FMTuPTR FMTPTR_PREFIX "u"
# define FMTxPTR FMTPTR_PREFIX "x"
# ifdef _WIN64
# define FMT64_PREFIX "ll"
# define FMTPTR_PREFIX "ll"
# else
# define FMT64_PREFIX "ll"
# define FMTPTR_PREFIX ""
# endif
# define FMTd32 "d"
# define FMTu32 "u"
# define FMTx32 "x"
# define FMTd64 FMT64_PREFIX "d"
# define FMTu64 FMT64_PREFIX "u"
# define FMTx64 FMT64_PREFIX "x"
# define FMTdPTR FMTPTR_PREFIX "d"
# define FMTuPTR FMTPTR_PREFIX "u"
# define FMTxPTR FMTPTR_PREFIX "x"
#else
# include <inttypes.h>
# define FMTd32 PRId32
# define FMTu32 PRIu32
# define FMTx32 PRIx32
# define FMTd64 PRId64
# define FMTu64 PRIu64
# define FMTx64 PRIx64
# define FMTdPTR PRIdPTR
# define FMTuPTR PRIuPTR
# define FMTxPTR PRIxPTR
# include <inttypes.h>
# define FMTd32 PRId32
# define FMTu32 PRIu32
# define FMTx32 PRIx32
# define FMTd64 PRId64
# define FMTu64 PRIu64
# define FMTx64 PRIx64
# define FMTdPTR PRIdPTR
# define FMTuPTR PRIuPTR
# define FMTxPTR PRIxPTR
#endif
/* Size of stack-allocated buffer passed to buferror(). */
#define BUFERROR_BUF 64
#define BUFERROR_BUF 64
/*
* Size of stack-allocated buffer used by malloc_{,v,vc}printf(). This must be
* large enough for all possible uses within jemalloc.
*/
#define MALLOC_PRINTF_BUFSIZE 4096
#define MALLOC_PRINTF_BUFSIZE 4096
write_cb_t wrtmessage;
int buferror(int err, char *buf, size_t buflen);
uintmax_t malloc_strtoumax(const char *restrict nptr, char **restrict endptr,
int base);
int buferror(int err, char *buf, size_t buflen);
uintmax_t malloc_strtoumax(
const char *restrict nptr, char **restrict endptr, int base);
void malloc_write(const char *s);
/*
* malloc_vsnprintf() supports a subset of snprintf(3) that avoids floating
* point math.
*/
size_t malloc_vsnprintf(char *str, size_t size, const char *format,
va_list ap);
size_t malloc_vsnprintf(char *str, size_t size, const char *format, va_list ap);
size_t malloc_snprintf(char *str, size_t size, const char *format, ...)
JEMALLOC_FORMAT_PRINTF(3, 4);
/*
* The caller can set write_cb to null to choose to print with the
* je_malloc_message hook.
*/
void malloc_vcprintf(write_cb_t *write_cb, void *cbopaque, const char *format,
va_list ap);
void malloc_vcprintf(
write_cb_t *write_cb, void *cbopaque, const char *format, va_list ap);
void malloc_cprintf(write_cb_t *write_cb, void *cbopaque, const char *format,
...) JEMALLOC_FORMAT_PRINTF(3, 4);
void malloc_printf(const char *format, ...) JEMALLOC_FORMAT_PRINTF(1, 2);
static inline ssize_t
malloc_write_fd(int fd, const void *buf, size_t count) {
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_write)
/*
* Use syscall(2) rather than write(2) when possible in order to avoid
* the possibility of memory allocation within libc. This is necessary
* on FreeBSD; most operating systems do not have this problem though.
*
* syscall() returns long or int, depending on platform, so capture the
* result in the widest plausible type to avoid compiler warnings.
*/
long result = syscall(SYS_write, fd, buf, count);
ssize_t malloc_write_fd(int fd, const void *buf, size_t count);
ssize_t malloc_read_fd(int fd, void *buf, size_t count);
static inline int
malloc_open(const char *path, int flags) {
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_open)
return (int)syscall(SYS_open, path, flags);
#elif defined(JEMALLOC_USE_SYSCALL) && defined(SYS_openat)
return (int)syscall(SYS_openat, AT_FDCWD, path, flags);
#else
ssize_t result = (ssize_t)write(fd, buf,
#ifdef _WIN32
(unsigned int)
return open(path, flags);
#endif
count);
#endif
return (ssize_t)result;
}
static inline ssize_t
malloc_read_fd(int fd, void *buf, size_t count) {
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_read)
long result = syscall(SYS_read, fd, buf, count);
static inline int
malloc_close(int fd) {
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_close)
return (int)syscall(SYS_close, fd);
#else
ssize_t result = read(fd, buf,
#ifdef _WIN32
(unsigned int)
return close(fd);
#endif
count);
}
static inline off_t
malloc_lseek(int fd, off_t offset, int whence) {
#if defined(JEMALLOC_USE_SYSCALL) && defined(SYS_lseek)
return (off_t)syscall(SYS_lseek, fd, offset, whence);
#else
return lseek(fd, offset, whence);
#endif
return (ssize_t)result;
}
#endif /* JEMALLOC_INTERNAL_MALLOC_IO_H */


@@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_MPSC_QUEUE_H
#define JEMALLOC_INTERNAL_MPSC_QUEUE_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
/*
@@ -25,6 +26,7 @@
* two-stack tricks reverses orders in the lock-free first stack).
*/
/* clang-format off */
#define mpsc_queue(a_type) \
struct { \
atomic_p_t tail; \
@@ -130,5 +132,6 @@ a_prefix##pop_batch(a_queue_type *queue, a_list_type *dst) { \
} \
ql_concat(dst, &reversed, a_link); \
}
/* clang-format on */
#endif /* JEMALLOC_INTERNAL_MPSC_QUEUE_H */


@@ -1,6 +1,7 @@
#ifndef JEMALLOC_INTERNAL_MUTEX_H
#define JEMALLOC_INTERNAL_MUTEX_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/mutex_prof.h"
#include "jemalloc/internal/tsd.h"
@@ -30,26 +31,29 @@ struct malloc_mutex_s {
* avoid prefetching a modified cacheline (for the
* unlocking thread).
*/
mutex_prof_data_t prof_data;
#ifdef _WIN32
# if _WIN32_WINNT >= 0x0600
SRWLOCK lock;
# else
CRITICAL_SECTION lock;
# endif
#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
os_unfair_lock lock;
#elif (defined(JEMALLOC_MUTEX_INIT_CB))
pthread_mutex_t lock;
malloc_mutex_t *postponed_next;
#else
pthread_mutex_t lock;
#endif
mutex_prof_data_t prof_data;
/*
* Hint flag to avoid exclusive cache line contention
* during spin waiting
* during spin waiting. Placed along with prof_data
* since it's always modified even with no contention.
* Modified by the lock owner only (after acquired, and
* before release), and may be read by other threads.
*/
atomic_b_t locked;
atomic_b_t locked;
#ifdef _WIN32
# if _WIN32_WINNT >= 0x0600
SRWLOCK lock;
# else
CRITICAL_SECTION lock;
# endif
#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
os_unfair_lock lock;
#elif (defined(JEMALLOC_MUTEX_INIT_CB))
pthread_mutex_t lock;
malloc_mutex_t *postponed_next;
#else
pthread_mutex_t lock;
#endif
};
/*
* We only touch witness when configured w/ debug. However we
@@ -58,82 +62,118 @@ struct malloc_mutex_s {
* memory cost.
*/
#if !defined(JEMALLOC_DEBUG)
witness_t witness;
malloc_mutex_lock_order_t lock_order;
witness_t witness;
malloc_mutex_lock_order_t lock_order;
#endif
};
#if defined(JEMALLOC_DEBUG)
witness_t witness;
malloc_mutex_lock_order_t lock_order;
witness_t witness;
malloc_mutex_lock_order_t lock_order;
#endif
};
#ifdef _WIN32
# if _WIN32_WINNT >= 0x0600
# define MALLOC_MUTEX_LOCK(m) AcquireSRWLockExclusive(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) ReleaseSRWLockExclusive(&(m)->lock)
# define MALLOC_MUTEX_TRYLOCK(m) (!TryAcquireSRWLockExclusive(&(m)->lock))
# else
# define MALLOC_MUTEX_LOCK(m) EnterCriticalSection(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) LeaveCriticalSection(&(m)->lock)
# define MALLOC_MUTEX_TRYLOCK(m) (!TryEnterCriticalSection(&(m)->lock))
# endif
# if _WIN32_WINNT >= 0x0600
# define MALLOC_MUTEX_LOCK(m) AcquireSRWLockExclusive(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) \
ReleaseSRWLockExclusive(&(m)->lock)
# define MALLOC_MUTEX_TRYLOCK(m) \
(!TryAcquireSRWLockExclusive(&(m)->lock))
# else
# define MALLOC_MUTEX_LOCK(m) EnterCriticalSection(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) LeaveCriticalSection(&(m)->lock)
# define MALLOC_MUTEX_TRYLOCK(m) \
(!TryEnterCriticalSection(&(m)->lock))
# endif
#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
# define MALLOC_MUTEX_LOCK(m) os_unfair_lock_lock(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) os_unfair_lock_unlock(&(m)->lock)
# define MALLOC_MUTEX_TRYLOCK(m) (!os_unfair_lock_trylock(&(m)->lock))
# define MALLOC_MUTEX_LOCK(m) os_unfair_lock_lock(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) os_unfair_lock_unlock(&(m)->lock)
# define MALLOC_MUTEX_TRYLOCK(m) (!os_unfair_lock_trylock(&(m)->lock))
#else
# define MALLOC_MUTEX_LOCK(m) pthread_mutex_lock(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) pthread_mutex_unlock(&(m)->lock)
# define MALLOC_MUTEX_TRYLOCK(m) (pthread_mutex_trylock(&(m)->lock) != 0)
# define MALLOC_MUTEX_LOCK(m) pthread_mutex_lock(&(m)->lock)
# define MALLOC_MUTEX_UNLOCK(m) pthread_mutex_unlock(&(m)->lock)
# define MALLOC_MUTEX_TRYLOCK(m) (pthread_mutex_trylock(&(m)->lock) != 0)
#endif
#define LOCK_PROF_DATA_INITIALIZER \
{NSTIME_ZERO_INITIALIZER, NSTIME_ZERO_INITIALIZER, 0, 0, 0, \
ATOMIC_INIT(0), 0, NULL, 0}
#define LOCK_PROF_DATA_INITIALIZER \
{ \
NSTIME_ZERO_INITIALIZER, NSTIME_ZERO_INITIALIZER, 0, 0, 0, \
ATOMIC_INIT(0), 0, NULL, 0 \
}
#ifdef _WIN32
# define MALLOC_MUTEX_INITIALIZER
# define MALLOC_MUTEX_INITIALIZER
#elif (defined(JEMALLOC_OS_UNFAIR_LOCK))
# if defined(JEMALLOC_DEBUG)
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
# else
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, OS_UNFAIR_LOCK_INIT, ATOMIC_INIT(false)}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
# endif
# if defined(JEMALLOC_DEBUG)
# define MALLOC_MUTEX_INITIALIZER \
{ \
{{LOCK_PROF_DATA_INITIALIZER, \
ATOMIC_INIT(false), OS_UNFAIR_LOCK_INIT}}, \
WITNESS_INITIALIZER( \
"mutex", WITNESS_RANK_OMIT), \
0 \
}
# else
# define MALLOC_MUTEX_INITIALIZER \
{ \
{{LOCK_PROF_DATA_INITIALIZER, \
ATOMIC_INIT(false), OS_UNFAIR_LOCK_INIT}}, \
WITNESS_INITIALIZER( \
"mutex", WITNESS_RANK_OMIT) \
}
# endif
#elif (defined(JEMALLOC_MUTEX_INIT_CB))
# if (defined(JEMALLOC_DEBUG))
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
# else
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, NULL, ATOMIC_INIT(false)}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
# endif
# if (defined(JEMALLOC_DEBUG))
# define MALLOC_MUTEX_INITIALIZER \
{ \
{{LOCK_PROF_DATA_INITIALIZER, \
ATOMIC_INIT(false), \
PTHREAD_MUTEX_INITIALIZER, NULL}}, \
WITNESS_INITIALIZER( \
"mutex", WITNESS_RANK_OMIT), \
0 \
}
# else
# define MALLOC_MUTEX_INITIALIZER \
{ \
{{LOCK_PROF_DATA_INITIALIZER, \
ATOMIC_INIT(false), \
PTHREAD_MUTEX_INITIALIZER, NULL}}, \
WITNESS_INITIALIZER( \
"mutex", WITNESS_RANK_OMIT) \
}
# endif
#else
# define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT
# if defined(JEMALLOC_DEBUG)
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT), 0}
# else
# define MALLOC_MUTEX_INITIALIZER \
{{{LOCK_PROF_DATA_INITIALIZER, PTHREAD_MUTEX_INITIALIZER, ATOMIC_INIT(false)}}, \
WITNESS_INITIALIZER("mutex", WITNESS_RANK_OMIT)}
# endif
# define MALLOC_MUTEX_TYPE PTHREAD_MUTEX_DEFAULT
# if defined(JEMALLOC_DEBUG)
# define MALLOC_MUTEX_INITIALIZER \
{ \
{{LOCK_PROF_DATA_INITIALIZER, \
ATOMIC_INIT(false), \
PTHREAD_MUTEX_INITIALIZER}}, \
WITNESS_INITIALIZER( \
"mutex", WITNESS_RANK_OMIT), \
0 \
}
# else
# define MALLOC_MUTEX_INITIALIZER \
{ \
{{LOCK_PROF_DATA_INITIALIZER, \
ATOMIC_INIT(false), \
PTHREAD_MUTEX_INITIALIZER}}, \
WITNESS_INITIALIZER( \
"mutex", WITNESS_RANK_OMIT) \
}
# endif
#endif
#ifdef JEMALLOC_LAZY_LOCK
extern bool isthreaded;
#else
# undef isthreaded /* Undo private_namespace.h definition. */
# define isthreaded true
# undef isthreaded /* Undo private_namespace.h definition. */
# define isthreaded true
#endif
bool malloc_mutex_init(malloc_mutex_t *mutex, const char *name,
@@ -154,7 +194,12 @@ malloc_mutex_lock_final(malloc_mutex_t *mutex) {
static inline bool
malloc_mutex_trylock_final(malloc_mutex_t *mutex) {
return MALLOC_MUTEX_TRYLOCK(mutex);
bool failed = MALLOC_MUTEX_TRYLOCK(mutex);
if (!failed) {
atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
}
return failed;
}
static inline void
@@ -169,15 +214,21 @@ mutex_owner_stats_update(tsdn_t *tsdn, malloc_mutex_t *mutex) {
}
}
static inline bool
malloc_mutex_is_locked(malloc_mutex_t *mutex) {
/* Used for sanity checking only. */
return atomic_load_b(&mutex->locked, ATOMIC_RELAXED);
}
/* Trylock: return false if the lock is successfully acquired. */
static inline bool
malloc_mutex_trylock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
witness_assert_not_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
if (isthreaded) {
if (malloc_mutex_trylock_final(mutex)) {
atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
return true;
}
assert(malloc_mutex_is_locked(mutex));
mutex_owner_stats_update(tsdn, mutex);
}
witness_lock(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
@@ -199,12 +250,12 @@ malloc_mutex_prof_merge(mutex_prof_data_t *sum, mutex_prof_data_t *data) {
if (sum->max_n_thds < data->max_n_thds) {
sum->max_n_thds = data->max_n_thds;
}
uint32_t cur_n_waiting_thds = atomic_load_u32(&sum->n_waiting_thds,
ATOMIC_RELAXED);
uint32_t new_n_waiting_thds = cur_n_waiting_thds + atomic_load_u32(
&data->n_waiting_thds, ATOMIC_RELAXED);
atomic_store_u32(&sum->n_waiting_thds, new_n_waiting_thds,
ATOMIC_RELAXED);
uint32_t cur_n_waiting_thds = atomic_load_u32(
&sum->n_waiting_thds, ATOMIC_RELAXED);
uint32_t new_n_waiting_thds = cur_n_waiting_thds
+ atomic_load_u32(&data->n_waiting_thds, ATOMIC_RELAXED);
atomic_store_u32(
&sum->n_waiting_thds, new_n_waiting_thds, ATOMIC_RELAXED);
sum->n_owner_switches += data->n_owner_switches;
sum->n_lock_ops += data->n_lock_ops;
}
@@ -215,8 +266,8 @@ malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
if (isthreaded) {
if (malloc_mutex_trylock_final(mutex)) {
malloc_mutex_lock_slow(mutex);
atomic_store_b(&mutex->locked, true, ATOMIC_RELAXED);
}
assert(malloc_mutex_is_locked(mutex));
mutex_owner_stats_update(tsdn, mutex);
}
witness_lock(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
@@ -224,9 +275,10 @@ malloc_mutex_lock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
static inline void
malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
atomic_store_b(&mutex->locked, false, ATOMIC_RELAXED);
witness_unlock(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
if (isthreaded) {
assert(malloc_mutex_is_locked(mutex));
atomic_store_b(&mutex->locked, false, ATOMIC_RELAXED);
MALLOC_MUTEX_UNLOCK(mutex);
}
}
@@ -234,6 +286,9 @@ malloc_mutex_unlock(tsdn_t *tsdn, malloc_mutex_t *mutex) {
static inline void
malloc_mutex_assert_owner(tsdn_t *tsdn, malloc_mutex_t *mutex) {
witness_assert_owner(tsdn_witness_tsdp_get(tsdn), &mutex->witness);
if (isthreaded) {
assert(malloc_mutex_is_locked(mutex));
}
}
static inline void
@@ -255,16 +310,16 @@ malloc_mutex_prof_copy(mutex_prof_data_t *dst, mutex_prof_data_t *source) {
/* Copy the prof data from mutex for processing. */
static inline void
malloc_mutex_prof_read(tsdn_t *tsdn, mutex_prof_data_t *data,
malloc_mutex_t *mutex) {
malloc_mutex_prof_read(
tsdn_t *tsdn, mutex_prof_data_t *data, malloc_mutex_t *mutex) {
/* Can only read holding the mutex. */
malloc_mutex_assert_owner(tsdn, mutex);
malloc_mutex_prof_copy(data, &mutex->prof_data);
}
static inline void
malloc_mutex_prof_accum(tsdn_t *tsdn, mutex_prof_data_t *data,
malloc_mutex_t *mutex) {
malloc_mutex_prof_accum(
tsdn_t *tsdn, mutex_prof_data_t *data, malloc_mutex_t *mutex) {
mutex_prof_data_t *source = &mutex->prof_data;
/* Can only read holding the mutex. */
malloc_mutex_assert_owner(tsdn, mutex);
@@ -286,8 +341,8 @@ malloc_mutex_prof_accum(tsdn_t *tsdn, mutex_prof_data_t *data,
/* Compare the prof data and update to the maximum. */
static inline void
malloc_mutex_prof_max_update(tsdn_t *tsdn, mutex_prof_data_t *data,
malloc_mutex_t *mutex) {
malloc_mutex_prof_max_update(
tsdn_t *tsdn, mutex_prof_data_t *data, malloc_mutex_t *mutex) {
mutex_prof_data_t *source = &mutex->prof_data;
/* Can only read holding the mutex. */
malloc_mutex_assert_owner(tsdn, mutex);

include/jemalloc/internal/mutex_prof.h

@@ -1,80 +1,81 @@
#ifndef JEMALLOC_INTERNAL_MUTEX_PROF_H
#define JEMALLOC_INTERNAL_MUTEX_PROF_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/atomic.h"
#include "jemalloc/internal/nstime.h"
#include "jemalloc/internal/tsd_types.h"
#define MUTEX_PROF_GLOBAL_MUTEXES \
OP(background_thread) \
OP(max_per_bg_thd) \
OP(ctl) \
OP(prof) \
OP(prof_thds_data) \
OP(prof_dump) \
OP(prof_recent_alloc) \
OP(prof_recent_dump) \
OP(prof_stats)
#define MUTEX_PROF_GLOBAL_MUTEXES \
OP(background_thread) \
OP(max_per_bg_thd) \
OP(ctl) \
OP(prof) \
OP(prof_thds_data) \
OP(prof_dump) \
OP(prof_recent_alloc) \
OP(prof_recent_dump) \
OP(prof_stats)
typedef enum {
#define OP(mtx) global_prof_mutex_##mtx,
MUTEX_PROF_GLOBAL_MUTEXES
#undef OP
mutex_prof_num_global_mutexes
mutex_prof_num_global_mutexes
} mutex_prof_global_ind_t;
#define MUTEX_PROF_ARENA_MUTEXES \
OP(large) \
OP(extent_avail) \
OP(extents_dirty) \
OP(extents_muzzy) \
OP(extents_retained) \
OP(decay_dirty) \
OP(decay_muzzy) \
OP(base) \
OP(tcache_list) \
OP(hpa_shard) \
OP(hpa_shard_grow) \
OP(hpa_sec)
#define MUTEX_PROF_ARENA_MUTEXES \
OP(large) \
OP(extent_avail) \
OP(extents_dirty) \
OP(extents_muzzy) \
OP(extents_retained) \
OP(decay_dirty) \
OP(decay_muzzy) \
OP(base) \
OP(tcache_list) \
OP(hpa_shard) \
OP(hpa_shard_grow) \
OP(hpa_sec)
typedef enum {
#define OP(mtx) arena_prof_mutex_##mtx,
MUTEX_PROF_ARENA_MUTEXES
#undef OP
mutex_prof_num_arena_mutexes
mutex_prof_num_arena_mutexes
} mutex_prof_arena_ind_t;
/*
* The fourth parameter is a boolean value that is true for derived rate counters
* and false for real ones.
*/
#define MUTEX_PROF_UINT64_COUNTERS \
OP(num_ops, uint64_t, "n_lock_ops", false, num_ops) \
OP(num_ops_ps, uint64_t, "(#/sec)", true, num_ops) \
OP(num_wait, uint64_t, "n_waiting", false, num_wait) \
OP(num_wait_ps, uint64_t, "(#/sec)", true, num_wait) \
OP(num_spin_acq, uint64_t, "n_spin_acq", false, num_spin_acq) \
OP(num_spin_acq_ps, uint64_t, "(#/sec)", true, num_spin_acq) \
OP(num_owner_switch, uint64_t, "n_owner_switch", false, num_owner_switch) \
OP(num_owner_switch_ps, uint64_t, "(#/sec)", true, num_owner_switch) \
OP(total_wait_time, uint64_t, "total_wait_ns", false, total_wait_time) \
OP(total_wait_time_ps, uint64_t, "(#/sec)", true, total_wait_time) \
OP(max_wait_time, uint64_t, "max_wait_ns", false, max_wait_time)
#define MUTEX_PROF_UINT64_COUNTERS \
OP(num_ops, uint64_t, "n_lock_ops", false, num_ops) \
OP(num_ops_ps, uint64_t, "(#/sec)", true, num_ops) \
OP(num_wait, uint64_t, "n_waiting", false, num_wait) \
OP(num_wait_ps, uint64_t, "(#/sec)", true, num_wait) \
OP(num_spin_acq, uint64_t, "n_spin_acq", false, num_spin_acq) \
OP(num_spin_acq_ps, uint64_t, "(#/sec)", true, num_spin_acq) \
OP(num_owner_switch, uint64_t, "n_owner_switch", false, \
num_owner_switch) \
OP(num_owner_switch_ps, uint64_t, "(#/sec)", true, num_owner_switch) \
OP(total_wait_time, uint64_t, "total_wait_ns", false, total_wait_time) \
OP(total_wait_time_ps, uint64_t, "(#/sec)", true, total_wait_time) \
OP(max_wait_time, uint64_t, "max_wait_ns", false, max_wait_time)
#define MUTEX_PROF_UINT32_COUNTERS \
OP(max_num_thds, uint32_t, "max_n_thds", false, max_num_thds)
#define MUTEX_PROF_UINT32_COUNTERS \
OP(max_num_thds, uint32_t, "max_n_thds", false, max_num_thds)
#define MUTEX_PROF_COUNTERS \
MUTEX_PROF_UINT64_COUNTERS \
MUTEX_PROF_UINT32_COUNTERS
#define MUTEX_PROF_COUNTERS \
MUTEX_PROF_UINT64_COUNTERS \
MUTEX_PROF_UINT32_COUNTERS
#define OP(counter, type, human, derived, base_counter) mutex_counter_##counter,
#define COUNTER_ENUM(counter_list, t) \
typedef enum { \
counter_list \
mutex_prof_num_##t##_counters \
} mutex_prof_##t##_counter_ind_t;
#define COUNTER_ENUM(counter_list, t) \
typedef enum { \
counter_list mutex_prof_num_##t##_counters \
} mutex_prof_##t##_counter_ind_t;
COUNTER_ENUM(MUTEX_PROF_UINT64_COUNTERS, uint64_t)
COUNTER_ENUM(MUTEX_PROF_UINT32_COUNTERS, uint32_t)
@@ -88,17 +89,17 @@ typedef struct {
* contention. We update them once we have the lock.
*/
/* Total time (in nanoseconds) spent waiting on this mutex. */
nstime_t tot_wait_time;
nstime_t tot_wait_time;
/* Max time (in nanoseconds) spent on a single lock operation. */
nstime_t max_wait_time;
nstime_t max_wait_time;
/* # of times have to wait for this mutex (after spinning). */
uint64_t n_wait_times;
uint64_t n_wait_times;
/* # of times acquired the mutex through local spinning. */
uint64_t n_spin_acquired;
uint64_t n_spin_acquired;
/* Max # of threads waiting for the mutex at the same time. */
uint32_t max_n_thds;
uint32_t max_n_thds;
/* Current # of threads waiting on the lock. Atomic synced. */
atomic_u32_t n_waiting_thds;
atomic_u32_t n_waiting_thds;
/*
* Data touched on the fast path. These are modified right after we
@@ -107,11 +108,11 @@ typedef struct {
* cacheline.
*/
/* # of times the mutex holder is different than the previous one. */
uint64_t n_owner_switches;
uint64_t n_owner_switches;
/* Previous mutex holder, to facilitate n_owner_switches. */
tsdn_t *prev_owner;
tsdn_t *prev_owner;
/* # of lock() operations in total. */
uint64_t n_lock_ops;
uint64_t n_lock_ops;
} mutex_prof_data_t;
#endif /* JEMALLOC_INTERNAL_MUTEX_PROF_H */

include/jemalloc/internal/nstime.h

@@ -1,14 +1,19 @@
#ifndef JEMALLOC_INTERNAL_NSTIME_H
#define JEMALLOC_INTERNAL_NSTIME_H
#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/assert.h"
/* Maximum supported number of seconds (~584 years). */
#define NSTIME_SEC_MAX KQU(18446744072)
#define NSTIME_MAGIC ((uint32_t)0xb8a9ce37)
#ifdef JEMALLOC_DEBUG
# define NSTIME_ZERO_INITIALIZER {0, NSTIME_MAGIC}
# define NSTIME_ZERO_INITIALIZER \
{ 0, NSTIME_MAGIC }
#else
# define NSTIME_ZERO_INITIALIZER {0}
# define NSTIME_ZERO_INITIALIZER \
{ 0 }
#endif
typedef struct {
@@ -20,43 +25,43 @@ typedef struct {
static const nstime_t nstime_zero = NSTIME_ZERO_INITIALIZER;
void nstime_init(nstime_t *time, uint64_t ns);
void nstime_init2(nstime_t *time, uint64_t sec, uint64_t nsec);
void nstime_init(nstime_t *time, uint64_t ns);
void nstime_init2(nstime_t *time, uint64_t sec, uint64_t nsec);
uint64_t nstime_ns(const nstime_t *time);
uint64_t nstime_ms(const nstime_t *time);
uint64_t nstime_sec(const nstime_t *time);
uint64_t nstime_msec(const nstime_t *time);
uint64_t nstime_nsec(const nstime_t *time);
void nstime_copy(nstime_t *time, const nstime_t *source);
int nstime_compare(const nstime_t *a, const nstime_t *b);
void nstime_add(nstime_t *time, const nstime_t *addend);
void nstime_iadd(nstime_t *time, uint64_t addend);
void nstime_subtract(nstime_t *time, const nstime_t *subtrahend);
void nstime_isubtract(nstime_t *time, uint64_t subtrahend);
void nstime_imultiply(nstime_t *time, uint64_t multiplier);
void nstime_idivide(nstime_t *time, uint64_t divisor);
void nstime_copy(nstime_t *time, const nstime_t *source);
int nstime_compare(const nstime_t *a, const nstime_t *b);
void nstime_add(nstime_t *time, const nstime_t *addend);
void nstime_iadd(nstime_t *time, uint64_t addend);
void nstime_subtract(nstime_t *time, const nstime_t *subtrahend);
void nstime_isubtract(nstime_t *time, uint64_t subtrahend);
void nstime_imultiply(nstime_t *time, uint64_t multiplier);
void nstime_idivide(nstime_t *time, uint64_t divisor);
uint64_t nstime_divide(const nstime_t *time, const nstime_t *divisor);
uint64_t nstime_ns_between(const nstime_t *earlier, const nstime_t *later);
uint64_t nstime_ms_between(const nstime_t *earlier, const nstime_t *later);
uint64_t nstime_ns_since(const nstime_t *past);
uint64_t nstime_ms_since(const nstime_t *past);
typedef bool (nstime_monotonic_t)(void);
typedef bool(nstime_monotonic_t)(void);
extern nstime_monotonic_t *JET_MUTABLE nstime_monotonic;
typedef void (nstime_update_t)(nstime_t *);
typedef void(nstime_update_t)(nstime_t *);
extern nstime_update_t *JET_MUTABLE nstime_update;
typedef void (nstime_prof_update_t)(nstime_t *);
typedef void(nstime_prof_update_t)(nstime_t *);
extern nstime_prof_update_t *JET_MUTABLE nstime_prof_update;
void nstime_init_update(nstime_t *time);
void nstime_prof_init_update(nstime_t *time);
enum prof_time_res_e {
prof_time_res_default = 0,
prof_time_res_high = 1
};
enum prof_time_res_e { prof_time_res_default = 0, prof_time_res_high = 1 };
typedef enum prof_time_res_e prof_time_res_t;
extern prof_time_res_t opt_prof_time_res;
extern const char *prof_time_res_mode_names[];
extern prof_time_res_t opt_prof_time_res;
extern const char *const prof_time_res_mode_names[];
JEMALLOC_ALWAYS_INLINE void
nstime_init_zero(nstime_t *time) {
@@ -64,7 +69,7 @@ nstime_init_zero(nstime_t *time) {
}
JEMALLOC_ALWAYS_INLINE bool
nstime_equals_zero(nstime_t *time) {
nstime_equals_zero(const nstime_t *time) {
int diff = nstime_compare(time, &nstime_zero);
assert(diff >= 0);
return diff == 0;

Some files were not shown because too many files have changed in this diff.