From a95018ee819abf897562d9d1f3bc31d4dd725a8d Mon Sep 17 00:00:00 2001 From: Daniel Micay <danielmicay@gmail.com> Date: Sat, 4 Oct 2014 01:39:32 -0400 Subject: [PATCH] Attempt to expand huge allocations in-place. This adds support for expanding huge allocations in-place by requesting memory at a specific address from the chunk allocator. It's currently only implemented for the chunk recycling path, although in theory it could also be done by optimistically allocating new chunks. On Linux, it could attempt an in-place mremap. However, that won't work in practice since the heap is grown downwards and memory is not unmapped (in a normal build, at least). Repeated vector reallocation micro-benchmark: #include <string.h> #include <stdlib.h> int main(void) { for (size_t i = 0; i < 100; i++) { void *ptr = NULL; size_t old_size = 0; for (size_t size = 4; size < (1 << 30); size *= 2) { ptr = realloc(ptr, size); if (!ptr) return 1; memset(ptr + old_size, 0xff, size - old_size); old_size = size; } free(ptr); } } The glibc allocator fails to do any in-place reallocations on this benchmark once it passes the M_MMAP_THRESHOLD (default 128k) but it elides the cost of copies via mremap, which is currently not something that jemalloc can use. With this improvement, jemalloc still fails to do any in-place huge reallocations for the first outer loop, but then succeeds 100% of the time for the remaining 99 iterations. The time spent doing allocations and copies drops down to under 5%, with nearly all of it spent doing purging + faulting (when huge pages are disabled) and the array memset. An improved mremap API (MREMAP_RETAIN - #138) would be far more general but this is a portable optimization and would still be useful on Linux for xallocx. 
Numbers with transparent huge pages enabled: glibc (copies elided via MREMAP_MAYMOVE): 8.471s jemalloc: 17.816s jemalloc + no-op madvise: 13.236s jemalloc + this commit: 6.787s jemalloc + this commit + no-op madvise: 6.144s Numbers with transparent huge pages disabled: glibc (copies elided via MREMAP_MAYMOVE): 15.403s jemalloc: 39.456s jemalloc + no-op madvise: 12.768s jemalloc + this commit: 15.534s jemalloc + this commit + no-op madvise: 6.354s Closes #137 --- doc/jemalloc.xml.in | 7 +- include/jemalloc/internal/arena.h | 4 +- include/jemalloc/internal/chunk.h | 8 +- include/jemalloc/internal/huge.h | 2 +- .../jemalloc/internal/jemalloc_internal.h.in | 2 +- include/jemalloc/jemalloc_typedefs.h.in | 2 +- src/arena.c | 8 +- src/chunk.c | 47 +++++++----- src/huge.c | 74 ++++++++++++++++++- test/integration/chunk.c | 5 +- 10 files changed, 118 insertions(+), 41 deletions(-) diff --git a/doc/jemalloc.xml.in b/doc/jemalloc.xml.in index fcbb4722..f9d464ce 100644 --- a/doc/jemalloc.xml.in +++ b/doc/jemalloc.xml.in @@ -1351,6 +1351,7 @@ malloc_conf = "xmalloc:true";]]> function that knows how to deallocate the chunks. typedef void *(chunk_alloc_t) + void *chunk size_t size size_t alignment bool *zero @@ -1367,8 +1368,10 @@ malloc_conf = "xmalloc:true";]]> size parameter is always a multiple of the chunk size. The alignment parameter is always a power of two at least as large as the chunk size. Zeroing is mandatory if - *zero is true upon function - entry. + *zero is true upon function entry. If + chunk is not NULL, the + returned pointer must be chunk or + NULL if it could not be allocated. 
Note that replacing the default chunk allocation function makes the arena's chunk_dalloc; malloc_mutex_unlock(&arena->lock); chunk = (arena_chunk_t *)chunk_alloc_arena(chunk_alloc, chunk_dalloc, - arena->ind, size, alignment, zero); + arena->ind, NULL, size, alignment, zero); malloc_mutex_lock(&arena->lock); if (config_stats && chunk != NULL) arena->stats.mapped += chunksize; @@ -459,8 +459,8 @@ arena_chunk_alloc_internal(arena_t *arena, size_t size, size_t alignment, } void * -arena_chunk_alloc_huge(arena_t *arena, size_t size, size_t alignment, - bool *zero) +arena_chunk_alloc_huge(arena_t *arena, void *new_addr, size_t size, + size_t alignment, bool *zero) { void *ret; chunk_alloc_t *chunk_alloc; @@ -480,7 +480,7 @@ arena_chunk_alloc_huge(arena_t *arena, size_t size, size_t alignment, malloc_mutex_unlock(&arena->lock); ret = chunk_alloc_arena(chunk_alloc, chunk_dalloc, arena->ind, - size, alignment, zero); + new_addr, size, alignment, zero); if (config_stats) { if (ret != NULL) stats_cactive_add(size); diff --git a/src/chunk.c b/src/chunk.c index cde8606e..32b8b3a6 100644 --- a/src/chunk.c +++ b/src/chunk.c @@ -42,8 +42,8 @@ static void chunk_dalloc_core(void *chunk, size_t size); /******************************************************************************/ static void * -chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size, - size_t alignment, bool base, bool *zero) +chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, + void *new_addr, size_t size, size_t alignment, bool base, bool *zero) { void *ret; extent_node_t *node; @@ -65,11 +65,11 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size, /* Beware size_t wrap-around. 
*/ if (alloc_size < size) return (NULL); - key.addr = NULL; + key.addr = new_addr; key.size = alloc_size; malloc_mutex_lock(&chunks_mtx); node = extent_tree_szad_nsearch(chunks_szad, &key); - if (node == NULL) { + if (node == NULL || (new_addr && node->addr != new_addr)) { malloc_mutex_unlock(&chunks_mtx); return (NULL); } @@ -142,8 +142,8 @@ chunk_recycle(extent_tree_t *chunks_szad, extent_tree_t *chunks_ad, size_t size, * them if they are returned. */ static void * -chunk_alloc_core(size_t size, size_t alignment, bool base, bool *zero, - dss_prec_t dss_prec) +chunk_alloc_core(void *new_addr, size_t size, size_t alignment, bool base, + bool *zero, dss_prec_t dss_prec) { void *ret; @@ -154,24 +154,30 @@ chunk_alloc_core(size_t size, size_t alignment, bool base, bool *zero, /* "primary" dss. */ if (have_dss && dss_prec == dss_prec_primary) { - if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, size, - alignment, base, zero)) != NULL) + if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, + new_addr, size, alignment, base, zero)) != NULL) return (ret); - if ((ret = chunk_alloc_dss(size, alignment, zero)) != NULL) + /* requesting an address only implemented for recycle */ + if (new_addr == NULL + && (ret = chunk_alloc_dss(size, alignment, zero)) != NULL) return (ret); } /* mmap. */ - if ((ret = chunk_recycle(&chunks_szad_mmap, &chunks_ad_mmap, size, - alignment, base, zero)) != NULL) + if ((ret = chunk_recycle(&chunks_szad_mmap, &chunks_ad_mmap, new_addr, + size, alignment, base, zero)) != NULL) return (ret); - if ((ret = chunk_alloc_mmap(size, alignment, zero)) != NULL) + /* requesting an address only implemented for recycle */ + if (new_addr == NULL && + (ret = chunk_alloc_mmap(size, alignment, zero)) != NULL) return (ret); /* "secondary" dss. 
*/ if (have_dss && dss_prec == dss_prec_secondary) { - if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, size, - alignment, base, zero)) != NULL) + if ((ret = chunk_recycle(&chunks_szad_dss, &chunks_ad_dss, + new_addr, size, alignment, base, zero)) != NULL) return (ret); - if ((ret = chunk_alloc_dss(size, alignment, zero)) != NULL) + /* requesting an address only implemented for recycle */ + if (new_addr == NULL && + (ret = chunk_alloc_dss(size, alignment, zero)) != NULL) return (ret); } @@ -219,7 +225,7 @@ chunk_alloc_base(size_t size) bool zero; zero = false; - ret = chunk_alloc_core(size, chunksize, true, &zero, + ret = chunk_alloc_core(NULL, size, chunksize, true, &zero, chunk_dss_prec_get()); if (ret == NULL) return (NULL); @@ -232,11 +238,12 @@ chunk_alloc_base(size_t size) void * chunk_alloc_arena(chunk_alloc_t *chunk_alloc, chunk_dalloc_t *chunk_dalloc, - unsigned arena_ind, size_t size, size_t alignment, bool *zero) + unsigned arena_ind, void *new_addr, size_t size, size_t alignment, + bool *zero) { void *ret; - ret = chunk_alloc(size, alignment, zero, arena_ind); + ret = chunk_alloc(new_addr, size, alignment, zero, arena_ind); if (ret != NULL && chunk_register(ret, size, false)) { chunk_dalloc(ret, size, arena_ind); ret = NULL; @@ -247,11 +254,11 @@ chunk_alloc_arena(chunk_alloc_t *chunk_alloc, chunk_dalloc_t *chunk_dalloc, /* Default arena chunk allocation routine in the absence of user override. 
*/ void * -chunk_alloc_default(size_t size, size_t alignment, bool *zero, +chunk_alloc_default(void *new_addr, size_t size, size_t alignment, bool *zero, unsigned arena_ind) { - return (chunk_alloc_core(size, alignment, false, zero, + return (chunk_alloc_core(new_addr, size, alignment, false, zero, arenas[arena_ind]->dss_prec)); } diff --git a/src/huge.c b/src/huge.c index 2f059b4d..6bdc0767 100644 --- a/src/huge.c +++ b/src/huge.c @@ -47,7 +47,7 @@ huge_palloc(tsd_t *tsd, arena_t *arena, size_t size, size_t alignment, */ is_zeroed = zero; arena = choose_arena(tsd, arena); - ret = arena_chunk_alloc_huge(arena, csize, alignment, &is_zeroed); + ret = arena_chunk_alloc_huge(arena, NULL, csize, alignment, &is_zeroed); if (ret == NULL) { base_node_dalloc(node); return (NULL); @@ -95,8 +95,66 @@ huge_dalloc_junk(void *ptr, size_t usize) huge_dalloc_junk_t *huge_dalloc_junk = JEMALLOC_N(huge_dalloc_junk_impl); #endif +static bool +huge_ralloc_no_move_expand(void *ptr, size_t oldsize, size_t size, bool zero) { + size_t csize; + void *expand_addr; + size_t expand_size; + extent_node_t *node, key; + arena_t *arena; + bool is_zeroed; + void *ret; + + csize = CHUNK_CEILING(size); + if (csize == 0) { + /* size is large enough to cause size_t wrap-around. */ + return (true); + } + + expand_addr = ptr + oldsize; + expand_size = csize - oldsize; + + malloc_mutex_lock(&huge_mtx); + + key.addr = ptr; + node = extent_tree_ad_search(&huge, &key); + assert(node != NULL); + assert(node->addr == ptr); + + /* Find the current arena. */ + arena = node->arena; + + malloc_mutex_unlock(&huge_mtx); + + /* + * Copy zero into is_zeroed and pass the copy to chunk_alloc(), so that + * it is possible to make correct junk/zero fill decisions below. 
+ */ + is_zeroed = zero; + ret = arena_chunk_alloc_huge(arena, expand_addr, expand_size, chunksize, + &is_zeroed); + if (ret == NULL) + return (true); + + assert(ret == expand_addr); + + malloc_mutex_lock(&huge_mtx); + /* Update the size of the huge allocation. */ + node->size = csize; + malloc_mutex_unlock(&huge_mtx); + + if (config_fill && !zero) { + if (unlikely(opt_junk)) + memset(expand_addr, 0xa5, expand_size); + else if (unlikely(opt_zero) && !is_zeroed) + memset(expand_addr, 0, expand_size); + } + return (false); +} + bool -huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra) +huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra, + bool zero) { /* Both allocations must be huge to avoid a move. */ @@ -145,7 +203,15 @@ huge_ralloc_no_move(void *ptr, size_t oldsize, size_t size, size_t extra) return (false); } - return (true); + /* Attempt to expand the allocation in-place. */ + if (huge_ralloc_no_move_expand(ptr, oldsize, size + extra, zero)) { + if (extra == 0) + return (true); + + /* Try again, this time without extra. */ + return (huge_ralloc_no_move_expand(ptr, oldsize, size, zero)); + } + return (false); } void * @@ -156,7 +222,7 @@ huge_ralloc(tsd_t *tsd, arena_t *arena, void *ptr, size_t oldsize, size_t size, size_t copysize; /* Try to avoid moving the allocation. 
*/ - if (!huge_ralloc_no_move(ptr, oldsize, size, extra)) + if (!huge_ralloc_no_move(ptr, oldsize, size, extra, zero)) return (ptr); /* diff --git a/test/integration/chunk.c b/test/integration/chunk.c index 28537098..89938504 100644 --- a/test/integration/chunk.c +++ b/test/integration/chunk.c @@ -11,10 +11,11 @@ chunk_dalloc(void *chunk, size_t size, unsigned arena_ind) } void * -chunk_alloc(size_t size, size_t alignment, bool *zero, unsigned arena_ind) +chunk_alloc(void *new_addr, size_t size, size_t alignment, bool *zero, + unsigned arena_ind) { - return (old_alloc(size, alignment, zero, arena_ind)); + return (old_alloc(new_addr, size, alignment, zero, arena_ind)); } TEST_BEGIN(test_chunk)