jemalloc/src/hpa.c

#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/hpa.h"

#include "jemalloc/internal/flat_bitmap.h"
#include "jemalloc/internal/witness.h"

#define HPA_EDEN_SIZE (128 * HUGEPAGE)

static edata_t *hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t alignment, bool zero);
static bool hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool zero);
static bool hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size);
static void hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata);

bool
hpa_supported() {
#ifdef _WIN32
	/*
	 * At least until the API and implementation is somewhat settled, we
	 * don't want to try to debug the VM subsystem on the hardest-to-test
	 * platform.
	 */
	return false;
#endif
	if (!pages_can_hugify) {
		return false;
	}
	/*
	 * We fundamentally rely on a address-space-hungry growth strategy for
	 * hugepages.
	 */
	if (LG_SIZEOF_PTR != 3) {
		return false;
	}
	/*
	 * If we couldn't detect the value of HUGEPAGE, HUGEPAGE_PAGES becomes
	 * this sentinel value -- see the comment in pages.h.
	 */
	if (HUGEPAGE_PAGES == 1) {
		return false;
	}
	return true;
}

bool
hpa_shard_init(hpa_shard_t *shard, emap_t *emap, base_t *base,
    edata_cache_t *edata_cache, unsigned ind, size_t alloc_max) {
	/* malloc_conf processing should have filtered out these cases. */
	assert(hpa_supported());
	bool err;
	err = malloc_mutex_init(&shard->grow_mtx, "hpa_shard_grow",
	    WITNESS_RANK_HPA_SHARD_GROW, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}
	err = malloc_mutex_init(&shard->mtx, "hpa_shard",
	    WITNESS_RANK_HPA_SHARD, malloc_mutex_rank_exclusive);
	if (err) {
		return true;
	}

	assert(edata_cache != NULL);
	shard->base = base;
	edata_cache_small_init(&shard->ecs, edata_cache);
	psset_init(&shard->psset);
	shard->alloc_max = alloc_max;
	hpdata_list_init(&shard->unused_slabs);
	shard->age_counter = 0;
	shard->eden = NULL;
	shard->eden_len = 0;
	shard->ind = ind;
	shard->emap = emap;

	/*
	 * Fill these in last, so that if an hpa_shard gets used despite
	 * initialization failing, we'll at least crash instead of just
	 * operating on corrupted data.
	 */
	shard->pai.alloc = &hpa_alloc;
	shard->pai.expand = &hpa_expand;
	shard->pai.shrink = &hpa_shrink;
	shard->pai.dalloc = &hpa_dalloc;

	return false;
}

/*
 * Note that the stats functions here follow the usual stats naming conventions;
 * "merge" obtains the stats from some live object of instance, while "accum"
 * only combines the stats from one stats objet to another.  Hence the lack of
 * locking here.
 */
void
hpa_shard_stats_accum(hpa_shard_stats_t *dst, hpa_shard_stats_t *src) {
	psset_stats_accum(&dst->psset_stats, &src->psset_stats);
}

void
hpa_shard_stats_merge(tsdn_t *tsdn, hpa_shard_t *shard,
    hpa_shard_stats_t *dst) {
	malloc_mutex_lock(tsdn, &shard->mtx);
	psset_stats_accum(&dst->psset_stats, &shard->psset.stats);
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

static hpdata_t *
hpa_alloc_ps(tsdn_t *tsdn, hpa_shard_t *shard) {
	return (hpdata_t *)base_alloc(tsdn, shard->base, sizeof(hpdata_t),
	    CACHELINE);
}

static bool
hpa_should_hugify(hpa_shard_t *shard, hpdata_t *ps) {
	/*
	 * For now, just use a static check; hugify a page if it's <= 5%
	 * inactive.  Eventually, this should be a malloc conf option.
	 */
	return !hpdata_huge_get(ps)
	    && hpdata_nfree_get(ps) < (HUGEPAGE / PAGE) * 5 / 100;
}

/* Returns true on error. */
static void
hpa_hugify(hpdata_t *ps) {
	assert(hpdata_huge_get(ps));
	bool err = pages_huge(hpdata_addr_get(ps), HUGEPAGE);
	/*
	 * Eat the error; even if the hugeification failed, it's still safe to
	 * pretend it didn't (and would require extraordinary measures to
	 * unhugify).
	 */
	(void)err;
}

static void
hpa_dehugify(hpdata_t *ps) {
	/* Purge, then dehugify while unbacked. */
	pages_purge_forced(hpdata_addr_get(ps), HUGEPAGE);
	pages_nohuge(hpdata_addr_get(ps), HUGEPAGE);
	hpdata_huge_set(ps, false);
}

static hpdata_t *
hpa_grow(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_assert_owner(tsdn, &shard->grow_mtx);
	hpdata_t *ps = NULL;

	/* Is there address space waiting for reuse? */
	malloc_mutex_assert_owner(tsdn, &shard->grow_mtx);
	ps = hpdata_list_first(&shard->unused_slabs);
	if (ps != NULL) {
		hpdata_list_remove(&shard->unused_slabs, ps);
		hpdata_age_set(ps, shard->age_counter++);
		return ps;
	}

	/* Is eden a perfect fit? */
	if (shard->eden != NULL && shard->eden_len == HUGEPAGE) {
		ps = hpa_alloc_ps(tsdn, shard);
		if (ps == NULL) {
			return NULL;
		}
		hpdata_init(ps, shard->eden, shard->age_counter++);
		shard->eden = NULL;
		shard->eden_len = 0;
		return ps;
	}

	/*
	 * We're about to try to allocate from eden by splitting.  If eden is
	 * NULL, we have to allocate it too.  Otherwise, we just have to
	 * allocate an edata_t for the new psset.
	 */
	if (shard->eden == NULL) {
		/*
		 * During development, we're primarily concerned with systems
		 * with overcommit.  Eventually, we should be more careful here.
		 */
		bool commit = true;
		/* Allocate address space, bailing if we fail. */
		void *new_eden = pages_map(NULL, HPA_EDEN_SIZE, HUGEPAGE,
		    &commit);
		if (new_eden == NULL) {
			return NULL;
		}
		ps = hpa_alloc_ps(tsdn, shard);
		if (ps == NULL) {
			pages_unmap(new_eden, HPA_EDEN_SIZE);
			return NULL;
		}
		shard->eden = new_eden;
		shard->eden_len = HPA_EDEN_SIZE;
	} else {
		/* Eden is already nonempty; only need an edata for ps. */
		ps = hpa_alloc_ps(tsdn, shard);
		if (ps == NULL) {
			return NULL;
		}
	}
	assert(ps != NULL);
	assert(shard->eden != NULL);
	assert(shard->eden_len > HUGEPAGE);
	assert(shard->eden_len % HUGEPAGE == 0);
	assert(HUGEPAGE_ADDR2BASE(shard->eden) == shard->eden);

	hpdata_init(ps, shard->eden, shard->age_counter++);

	char *eden_char = (char *)shard->eden;
	eden_char += HUGEPAGE;
	shard->eden = (void *)eden_char;
	shard->eden_len -= HUGEPAGE;

	return ps;
}

/*
 * The psset does not hold empty slabs.  Upon becoming empty, then, we need to
 * put them somewhere.  We take this as an opportunity to purge, and retain
 * their address space in a list outside the psset.
 */
static void
hpa_handle_ps_eviction(tsdn_t *tsdn, hpa_shard_t *shard, hpdata_t *ps) {
	/*
	 * We do relatively expensive system calls.  The ps was evicted, so no
	 * one should touch it while we're also touching it.
	 */
	malloc_mutex_assert_not_owner(tsdn, &shard->mtx);
	malloc_mutex_assert_not_owner(tsdn, &shard->grow_mtx);

	/*
	 * We do this unconditionally, even for pages which were not originally
	 * hugeified; it has the same effect.
	 */
	hpa_dehugify(ps);

	malloc_mutex_lock(tsdn, &shard->grow_mtx);
	hpdata_list_prepend(&shard->unused_slabs, ps);
	malloc_mutex_unlock(tsdn, &shard->grow_mtx);
}

static edata_t *
hpa_try_alloc_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size, bool *oom) {
	bool err;
	malloc_mutex_lock(tsdn, &shard->mtx);
	edata_t *edata = edata_cache_small_get(tsdn, &shard->ecs);
	*oom = false;
	if (edata == NULL) {
		malloc_mutex_unlock(tsdn, &shard->mtx);
		*oom = true;
		return NULL;
	}
	assert(edata_arena_ind_get(edata) == shard->ind);

	err = psset_alloc_reuse(&shard->psset, edata, size);
	if (err) {
		edata_cache_small_put(tsdn, &shard->ecs, edata);
		malloc_mutex_unlock(tsdn, &shard->mtx);
		return NULL;
	}
	/*
	 * This could theoretically be moved outside of the critical section,
	 * but that introduces the potential for a race.  Without the lock, the
	 * (initially nonempty, since this is the reuse pathway) pageslab we
	 * allocated out of could become otherwise empty while the lock is
	 * dropped.  This would force us to deal with a pageslab eviction down
	 * the error pathway, which is a pain.
	 */
	err = emap_register_boundary(tsdn, shard->emap, edata,
	    SC_NSIZES, /* slab */ false);
	if (err) {
		hpdata_t *ps = psset_dalloc(&shard->psset, edata);
		/*
		 * The pageslab was nonempty before we started; it
		 * should still be nonempty now, and so shouldn't get
		 * evicted.
		 */
		assert(ps == NULL);
		edata_cache_small_put(tsdn, &shard->ecs, edata);
		malloc_mutex_unlock(tsdn, &shard->mtx);
		*oom = true;
		return NULL;
	}

	hpdata_t *ps = edata_ps_get(edata);
	assert(ps != NULL);
	bool hugify = hpa_should_hugify(shard, ps);
	if (hugify) {
		/*
		 * Do the metadata modification while holding the lock; we'll
		 * actually change state with the lock dropped.
		 */
		psset_hugify(&shard->psset, ps);
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
	if (hugify) {
		/*
		 * Hugifying with the lock dropped is safe, even with
		 * concurrent modifications to the ps.  This relies on
		 * the fact that the current implementation will never
		 * dehugify a non-empty pageslab, and ps will never
		 * become empty before we return edata to the user to be
		 * freed.
		 *
		 * Note that holding the lock would prevent not just operations
		 * on this page slab, but also operations any other alloc/dalloc
		 * operations in this hpa shard.
		 */
		hpa_hugify(ps);
	}
	return edata;
}

static edata_t *
hpa_alloc_psset(tsdn_t *tsdn, hpa_shard_t *shard, size_t size) {
	assert(size <= shard->alloc_max);
	bool err;
	bool oom;
	edata_t *edata;

	edata = hpa_try_alloc_no_grow(tsdn, shard, size, &oom);
	if (edata != NULL) {
		return edata;
	}

	/* Nothing in the psset works; we have to grow it. */
	malloc_mutex_lock(tsdn, &shard->grow_mtx);
	/*
	 * Check for grow races; maybe some earlier thread expanded the psset
	 * in between when we dropped the main mutex and grabbed the grow mutex.
	 */
	edata = hpa_try_alloc_no_grow(tsdn, shard, size, &oom);
	if (edata != NULL || oom) {
		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
		return edata;
	}

	/*
	 * Note that we don't hold shard->mtx here (while growing);
	 * deallocations (and allocations of smaller sizes) may still succeed
	 * while we're doing this potentially expensive system call.
	 */
	hpdata_t *grow_ps = hpa_grow(tsdn, shard);
	if (grow_ps == NULL) {
		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
		return NULL;
	}

	/* We got the new edata; allocate from it. */
	malloc_mutex_lock(tsdn, &shard->mtx);
	edata = edata_cache_small_get(tsdn, &shard->ecs);
	if (edata == NULL) {
		malloc_mutex_unlock(tsdn, &shard->mtx);
		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
		hpa_handle_ps_eviction(tsdn, shard, grow_ps);
		return NULL;
	}
	psset_alloc_new(&shard->psset, grow_ps, edata, size);
	err = emap_register_boundary(tsdn, shard->emap, edata,
	    SC_NSIZES, /* slab */ false);
	if (err) {
		hpdata_t *ps = psset_dalloc(&shard->psset, edata);
		/*
		 * The pageslab was empty except for the new allocation; it
		 * should get evicted.
		 */
		assert(ps == grow_ps);
		edata_cache_small_put(tsdn, &shard->ecs, edata);
		/*
		 * Technically the same as fallthrough at the time of this
		 * writing, but consistent with the error handling in the rest
		 * of the function.
		 */
		malloc_mutex_unlock(tsdn, &shard->mtx);
		malloc_mutex_unlock(tsdn, &shard->grow_mtx);
		hpa_handle_ps_eviction(tsdn, shard, ps);
		return NULL;
	}
	malloc_mutex_unlock(tsdn, &shard->mtx);
	malloc_mutex_unlock(tsdn, &shard->grow_mtx);
	return edata;
}

static hpa_shard_t *
hpa_from_pai(pai_t *self) {
	assert(self->alloc = &hpa_alloc);
	assert(self->expand = &hpa_expand);
	assert(self->shrink = &hpa_shrink);
	assert(self->dalloc = &hpa_dalloc);
	return (hpa_shard_t *)self;
}

static edata_t *
hpa_alloc(tsdn_t *tsdn, pai_t *self, size_t size,
    size_t alignment, bool zero) {
	assert((size & PAGE_MASK) == 0);
	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);

	hpa_shard_t *shard = hpa_from_pai(self);
	/* We don't handle alignment or zeroing for now. */
	if (alignment > PAGE || zero) {
		return NULL;
	}
	if (size > shard->alloc_max) {
		return NULL;
	}

	edata_t *edata = hpa_alloc_psset(tsdn, shard, size);

	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
	    WITNESS_RANK_CORE, 0);

	if (edata != NULL) {
		emap_assert_mapped(tsdn, shard->emap, edata);
		assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
		assert(edata_state_get(edata) == extent_state_active);
		assert(edata_arena_ind_get(edata) == shard->ind);
		assert(edata_szind_get_maybe_invalid(edata) == SC_NSIZES);
		assert(!edata_slab_get(edata));
		assert(edata_committed_get(edata));
		assert(edata_base_get(edata) == edata_addr_get(edata));
		assert(edata_base_get(edata) != NULL);
	}
	return edata;
}

static bool
hpa_expand(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size, bool zero) {
	/* Expand not yet supported. */
	return true;
}

static bool
hpa_shrink(tsdn_t *tsdn, pai_t *self, edata_t *edata,
    size_t old_size, size_t new_size) {
	/* Shrink not yet supported. */
	return true;
}

static void
hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata) {
	hpa_shard_t *shard = hpa_from_pai(self);

	edata_addr_set(edata, edata_base_get(edata));
	edata_zeroed_set(edata, false);

	assert(edata_pai_get(edata) == EXTENT_PAI_HPA);
	assert(edata_state_get(edata) == extent_state_active);
	assert(edata_arena_ind_get(edata) == shard->ind);
	assert(edata_szind_get_maybe_invalid(edata) == SC_NSIZES);
	assert(!edata_slab_get(edata));
	assert(edata_committed_get(edata));
	assert(edata_base_get(edata) != NULL);

	hpdata_t *ps = edata_ps_get(edata);
	/* Currently, all edatas come from pageslabs. */
	assert(ps != NULL);
	emap_deregister_boundary(tsdn, shard->emap, edata);
	malloc_mutex_lock(tsdn, &shard->mtx);
	/*
	 * Note that the shard mutex protects the edata hugeified field, too.
	 * Page slabs can move between pssets (and have their hugeified status
	 * change) in racy ways.
	 */
	hpdata_t *evicted_ps = psset_dalloc(&shard->psset, edata);
	/*
	 * If a pageslab became empty because of the dalloc, it better have been
	 * the one we expected.
	 */
	assert(evicted_ps == NULL || evicted_ps == ps);
	edata_cache_small_put(tsdn, &shard->ecs, edata);
	malloc_mutex_unlock(tsdn, &shard->mtx);
	if (evicted_ps != NULL) {
		hpa_handle_ps_eviction(tsdn, shard, evicted_ps);
	}
}

void
hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_lock(tsdn, &shard->mtx);
	edata_cache_small_disable(tsdn, &shard->ecs);
	malloc_mutex_unlock(tsdn, &shard->mtx);
}

static void
hpa_shard_assert_stats_empty(psset_bin_stats_t *bin_stats) {
	assert(bin_stats->npageslabs_huge == 0);
	assert(bin_stats->nactive_huge == 0);
	assert(bin_stats->ninactive_huge == 0);
	assert(bin_stats->npageslabs_nonhuge == 0);
	assert(bin_stats->nactive_nonhuge == 0);
	assert(bin_stats->ninactive_nonhuge == 0);
}

static void
hpa_assert_empty(tsdn_t *tsdn, hpa_shard_t *shard, psset_t *psset) {
	edata_t edata = {0};
	malloc_mutex_assert_owner(tsdn, &shard->mtx);
	bool psset_empty = psset_alloc_reuse(psset, &edata, PAGE);
	assert(psset_empty);
	hpa_shard_assert_stats_empty(&psset->stats.full_slabs);
	for (pszind_t i = 0; i < PSSET_NPSIZES; i++) {
		hpa_shard_assert_stats_empty(
		    &psset->stats.nonfull_slabs[i]);
	}
}

void
hpa_shard_destroy(tsdn_t *tsdn, hpa_shard_t *shard) {
	/*
	 * By the time we're here, the arena code should have dalloc'd all the
	 * active extents, which means we should have eventually evicted
	 * everything from the psset, so it shouldn't be able to serve even a
	 * 1-page allocation.
	 */
	if (config_debug) {
		malloc_mutex_lock(tsdn, &shard->mtx);
		hpa_assert_empty(tsdn, shard, &shard->psset);
		malloc_mutex_unlock(tsdn, &shard->mtx);
	}
	hpdata_t *ps;
	while ((ps = hpdata_list_first(&shard->unused_slabs)) != NULL) {
		hpdata_list_remove(&shard->unused_slabs, ps);
		pages_unmap(hpdata_addr_get(ps), HUGEPAGE);
	}
}

void
hpa_shard_prefork3(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_prefork(tsdn, &shard->grow_mtx);
}

void
hpa_shard_prefork4(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_prefork(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_parent(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_postfork_parent(tsdn, &shard->grow_mtx);
	malloc_mutex_postfork_parent(tsdn, &shard->mtx);
}

void
hpa_shard_postfork_child(tsdn_t *tsdn, hpa_shard_t *shard) {
	malloc_mutex_postfork_child(tsdn, &shard->grow_mtx);
	malloc_mutex_postfork_child(tsdn, &shard->mtx);
}