jemalloc/src/sec.c

#include "jemalloc/internal/jemalloc_preamble.h"
#include "jemalloc/internal/jemalloc_internal_includes.h"
#include "jemalloc/internal/sec.h"
#include "jemalloc/internal/jemalloc_probe.h"
static bool
sec_bin_init(sec_bin_t *bin) {
bin->bytes_cur = 0;
sec_bin_stats_init(&bin->stats);
edata_list_active_init(&bin->freelist);
bool err = malloc_mutex_init(&bin->mtx, "sec_bin", WITNESS_RANK_SEC_BIN,
malloc_mutex_rank_exclusive);
if (err) {
return true;
}
return false;
}
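/*
 * Initialize the SEC.  One bin is created per (shard, page size class) pair,
 * all allocated in a single block from the base allocator.  A shard count of
 * zero leaves the SEC unused and is not an error.  Returns true on error.
 */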
bool
sec_init(tsdn_t *tsdn, sec_t *sec, base_t *base, const sec_opts_t *opts) {
sec->opts = *opts;
if (opts->nshards == 0) {
return false;
}
assert(opts->max_alloc >= PAGE);
/*
 * As with tcache, the SEC does not cache allocs/dallocs larger than
 * USIZE_GROW_SLOW_THRESHOLD: above that threshold, usizes grow in PAGE
 * increments and the number of distinct usizes becomes too large.
 */
assert(opts->max_alloc <= USIZE_GROW_SLOW_THRESHOLD);
size_t max_alloc = PAGE_FLOOR(opts->max_alloc);
pszind_t npsizes = sz_psz2ind(max_alloc) + 1;
size_t ntotal_bins = opts->nshards * (size_t)npsizes;
size_t sz_bins = sizeof(sec_bin_t) * ntotal_bins;
void *dynalloc = base_alloc(tsdn, base, sz_bins, CACHELINE);
if (dynalloc == NULL) {
return true;
}
sec->bins = (sec_bin_t *)dynalloc;
for (pszind_t j = 0; j < ntotal_bins; j++) {
if (sec_bin_init(&sec->bins[j])) {
return true;
}
}
sec->npsizes = npsizes;
return false;
}
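/*
 * Return the shard index for the current thread.  The first call on a thread
 * picks a uniformly random shard and caches it in TSD; without TSD we fall
 * back to shard 0.
 */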
static uint8_t
sec_shard_pick(tsdn_t *tsdn, sec_t *sec) {
/*
 * Eventually, we should implement affinity, tracking the source shard
 * in the edata_t's newly freed-up fields.  For now, just distribute
 * randomly across all shards.
 */
if (tsdn_null(tsdn)) {
return 0;
}
tsd_t *tsd = tsdn_tsd(tsdn);
uint8_t *idxp = tsd_sec_shardp_get(tsd);
if (*idxp == (uint8_t)-1) {
/*
 * First use; initialize using the trick from Daniel Lemire's
 * "A fast alternative to the modulo reduction": use a 64-bit
 * number to hold the 32-bit random value, since we'll deliberately
 * overflow into the upper bits when we multiply by the number of
 * shards.
 */
uint64_t rand32 = prng_lg_range_u64(
tsd_prng_statep_get(tsd), 32);
uint32_t idx = (uint32_t)((rand32 * (uint64_t)sec->opts.nshards)
>> 32);
assert(idx < (uint32_t)sec->opts.nshards);
*idxp = (uint8_t)idx;
}
return *idxp;
}
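/*
 * Map a (shard, page size index) pair to its bin.  Bins are laid out
 * shard-major: bins[shard * npsizes + pszind].
 */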
static sec_bin_t *
sec_bin_pick(sec_t *sec, uint8_t shard, pszind_t pszind) {
assert(shard < sec->opts.nshards);
size_t ind = (size_t)shard * sec->npsizes + pszind;
assert(ind < sec->npsizes * sec->opts.nshards);
return &sec->bins[ind];
}
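/*
 * How many extents of the given size to batch-allocate when filling the SEC,
 * clamped to [1, MAX_SEC_NALLOCS].  Sizes the SEC cannot cache get 1.
 */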
size_t
sec_calc_nallocs_for_size(sec_t *sec, size_t size) {
size_t res = 1;
if (sec_size_supported(sec, size)) {
/*
* This attempts to fill up to 1/MAX_BYTES_DIV of the SEC.
* If we go much over that, we might cause purging.
* This is mainly an issue when max_bytes is small (256K)
* and size is large. For larger max_bytes, we will
* almost always end up with MAX_SEC_NALLOCS.
*/
res = sec->opts.max_bytes / size / MAX_BYTES_DIV;
res = MAX(res, 1);
res = MIN(res, MAX_SEC_NALLOCS);
}
/* post-conditions */
assert(1 <= res);
assert(res <= MAX_SEC_NALLOCS);
return res;
}
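/*
 * Pop one cached extent from the bin's freelist, or return NULL if the bin is
 * empty.  The caller must hold bin->mtx.
 */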
static edata_t *
sec_bin_alloc_locked(tsdn_t *tsdn, sec_t *sec, sec_bin_t *bin, size_t size) {
malloc_mutex_assert_owner(tsdn, &bin->mtx);
edata_t *edata = edata_list_active_first(&bin->freelist);
if (edata != NULL) {
assert(!edata_list_active_empty(&bin->freelist));
edata_list_active_remove(&bin->freelist, edata);
size_t sz = edata_size_get(edata);
assert(sz <= bin->bytes_cur && sz > 0);
bin->bytes_cur -= sz;
bin->stats.nhits++;
}
return edata;
}
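/*
 * Starting from this thread's shard, trylock each shard's bin for this size
 * class in turn and return the first cached extent found.  If none of the
 * trylocks yields an extent, fall back to a blocking lock on the home shard,
 * counting a miss if it is still empty.
 */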
static edata_t *
sec_multishard_trylock_alloc(
tsdn_t *tsdn, sec_t *sec, size_t size, pszind_t pszind) {
assert(sec->opts.nshards > 0);
uint8_t cur_shard = sec_shard_pick(tsdn, sec);
sec_bin_t *bin;
for (size_t i = 0; i < sec->opts.nshards; ++i) {
bin = sec_bin_pick(sec, cur_shard, pszind);
if (!malloc_mutex_trylock(tsdn, &bin->mtx)) {
edata_t *edata = sec_bin_alloc_locked(
tsdn, sec, bin, size);
malloc_mutex_unlock(tsdn, &bin->mtx);
if (edata != NULL) {
JE_USDT(sec_alloc, 5, sec, bin, edata, size,
/* frequent_reuse */ 1);
return edata;
}
}
cur_shard++;
if (cur_shard == sec->opts.nshards) {
cur_shard = 0;
}
}
/*
* TODO: Benchmark whether it is worth blocking on all shards here before
* declaring a miss. That could recover more remote-shard hits under
* contention, but it also changes the allocation latency policy.
*/
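/* The loop wrapped around; cur_shard is back at this thread's home shard. */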
assert(cur_shard == sec_shard_pick(tsdn, sec));
bin = sec_bin_pick(sec, cur_shard, pszind);
malloc_mutex_lock(tsdn, &bin->mtx);
edata_t *edata = sec_bin_alloc_locked(tsdn, sec, bin, size);
if (edata == NULL) {
/* Only now do we know it is a miss. */
bin->stats.nmisses++;
}
malloc_mutex_unlock(tsdn, &bin->mtx);
JE_USDT(sec_alloc, 5, sec, bin, edata, size, /* frequent_reuse */ 1);
return edata;
}
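/*
 * Allocate a cached extent of the requested size, or return NULL on a miss
 * (unsupported size, or nothing cached).
 */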
edata_t *
sec_alloc(tsdn_t *tsdn, sec_t *sec, size_t size) {
if (!sec_size_supported(sec, size)) {
return NULL;
}
assert((size & PAGE_MASK) == 0);
pszind_t pszind = sz_psz2ind(size);
assert(pszind < sec->npsizes);
/*
* If there's only one shard, skip the trylock optimization and
* go straight to the blocking lock.
*/
if (sec->opts.nshards == 1) {
sec_bin_t *bin = sec_bin_pick(sec, /* shard */ 0, pszind);
malloc_mutex_lock(tsdn, &bin->mtx);
edata_t *edata = sec_bin_alloc_locked(tsdn, sec, bin, size);
if (edata == NULL) {
bin->stats.nmisses++;
}
malloc_mutex_unlock(tsdn, &bin->mtx);
JE_USDT(sec_alloc, 5, sec, bin, edata, size,
/* frequent_reuse */ 1);
return edata;
}
return sec_multishard_trylock_alloc(tsdn, sec, size, pszind);
}
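/*
 * Cache the single extent on dalloc_list in the bin.  If that pushes the bin
 * over max_bytes, evict extents from the tail of the freelist back onto
 * dalloc_list until the bin is down to 3/4 of max_bytes; the caller flushes
 * whatever remains on the list.  The caller must hold bin->mtx.
 */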
static void
sec_bin_dalloc_locked(tsdn_t *tsdn, sec_t *sec, sec_bin_t *bin, size_t size,
edata_list_active_t *dalloc_list) {
malloc_mutex_assert_owner(tsdn, &bin->mtx);
bin->bytes_cur += size;
edata_t *edata = edata_list_active_first(dalloc_list);
assert(edata != NULL);
edata_list_active_remove(dalloc_list, edata);
JE_USDT(sec_dalloc, 3, sec, bin, edata);
edata_list_active_prepend(&bin->freelist, edata);
/* Only a single extent is handed to the SEC at a time. */
assert(edata_list_active_empty(dalloc_list));
if (bin->bytes_cur <= sec->opts.max_bytes) {
bin->stats.ndalloc_noflush++;
return;
}
bin->stats.ndalloc_flush++;
/* Flush down to 3/4 of max_bytes, i.e. free up at least 1/4. */
size_t bytes_target = sec->opts.max_bytes - (sec->opts.max_bytes >> 2);
while (bin->bytes_cur > bytes_target
&& !edata_list_active_empty(&bin->freelist)) {
edata_t *cur = edata_list_active_last(&bin->freelist);
size_t sz = edata_size_get(cur);
assert(sz <= bin->bytes_cur && sz > 0);
bin->bytes_cur -= sz;
edata_list_active_remove(&bin->freelist, cur);
edata_list_active_append(dalloc_list, cur);
}
}
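/*
 * Cache the extent on dalloc_list in the first shard whose bin lock can be
 * acquired without blocking, starting with this thread's shard; if every
 * trylock fails, block on the home shard's bin.
 */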
static void
sec_multishard_trylock_dalloc(tsdn_t *tsdn, sec_t *sec, size_t size,
pszind_t pszind, edata_list_active_t *dalloc_list) {
assert(sec->opts.nshards > 0);
/* Try to dalloc into this thread's bin first. */
uint8_t cur_shard = sec_shard_pick(tsdn, sec);
for (size_t i = 0; i < sec->opts.nshards; ++i) {
sec_bin_t *bin = sec_bin_pick(sec, cur_shard, pszind);
if (!malloc_mutex_trylock(tsdn, &bin->mtx)) {
sec_bin_dalloc_locked(
tsdn, sec, bin, size, dalloc_list);
malloc_mutex_unlock(tsdn, &bin->mtx);
return;
}
cur_shard++;
if (cur_shard == sec->opts.nshards) {
cur_shard = 0;
}
}
/* Every trylock failed; fall back to blocking on this thread's shard. */
assert(cur_shard == sec_shard_pick(tsdn, sec));
sec_bin_t *bin = sec_bin_pick(sec, cur_shard, pszind);
malloc_mutex_lock(tsdn, &bin->mtx);
sec_bin_dalloc_locked(tsdn, sec, bin, size, dalloc_list);
malloc_mutex_unlock(tsdn, &bin->mtx);
}
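/*
 * Give the single extent on dalloc_list back to the SEC.  Extents remaining
 * on the list when this returns (unsupported size, or evicted to stay under
 * max_bytes) are left for the caller to flush.
 */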
void
sec_dalloc(tsdn_t *tsdn, sec_t *sec, edata_list_active_t *dalloc_list) {
if (!sec_is_used(sec)) {
return;
}
edata_t *edata = edata_list_active_first(dalloc_list);
size_t size = edata_size_get(edata);
if (size > sec->opts.max_alloc) {
return;
}
pszind_t pszind = sz_psz2ind(size);
assert(pszind < sec->npsizes);
/*
* If there's only one shard, skip the trylock optimization and
* go straight to the blocking lock.
*/
if (sec->opts.nshards == 1) {
sec_bin_t *bin = sec_bin_pick(sec, /* shard */ 0, pszind);
malloc_mutex_lock(tsdn, &bin->mtx);
sec_bin_dalloc_locked(tsdn, sec, bin, size, dalloc_list);
malloc_mutex_unlock(tsdn, &bin->mtx);
return;
}
sec_multishard_trylock_dalloc(tsdn, sec, size, pszind, dalloc_list);
}
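/*
 * Deposit the freshly allocated extents on result (nallocs extents of size
 * bytes each) into this thread's bin.  If that would push the bin over
 * max_bytes, only as many extents as fit are taken; the rest stay on result.
 */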
void
sec_fill(tsdn_t *tsdn, sec_t *sec, size_t size, edata_list_active_t *result,
size_t nallocs) {
assert((size & PAGE_MASK) == 0);
assert(sec->opts.nshards != 0 && size <= sec->opts.max_alloc);
assert(nallocs > 0);
pszind_t pszind = sz_psz2ind(size);
assert(pszind < sec->npsizes);
sec_bin_t *bin = sec_bin_pick(sec, sec_shard_pick(tsdn, sec), pszind);
malloc_mutex_assert_not_owner(tsdn, &bin->mtx);
malloc_mutex_lock(tsdn, &bin->mtx);
size_t new_cached_bytes = nallocs * size;
if (bin->bytes_cur + new_cached_bytes <= sec->opts.max_bytes) {
assert(!edata_list_active_empty(result));
edata_list_active_concat(&bin->freelist, result);
bin->bytes_cur += new_cached_bytes;
} else {
/*
* Unlikely case of many threads filling at the same time and
* going above max.
*/
bin->stats.noverfills++;
while (bin->bytes_cur + size <= sec->opts.max_bytes) {
edata_t *edata = edata_list_active_first(result);
if (edata == NULL) {
break;
}
edata_list_active_remove(result, edata);
assert(size == edata_size_get(edata));
edata_list_active_append(&bin->freelist, edata);
bin->bytes_cur += size;
}
}
malloc_mutex_unlock(tsdn, &bin->mtx);
}
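/*
 * Empty every bin, moving all cached extents onto to_flush for the caller to
 * dalloc.
 */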
void
sec_flush(tsdn_t *tsdn, sec_t *sec, edata_list_active_t *to_flush) {
if (!sec_is_used(sec)) {
return;
}
size_t ntotal_bins = sec->opts.nshards * sec->npsizes;
for (pszind_t i = 0; i < ntotal_bins; i++) {
sec_bin_t *bin = &sec->bins[i];
malloc_mutex_lock(tsdn, &bin->mtx);
bin->bytes_cur = 0;
edata_list_active_concat(to_flush, &bin->freelist);
malloc_mutex_unlock(tsdn, &bin->mtx);
}
}
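/*
 * Accumulate cached bytes and per-bin counters into stats, taking each bin's
 * lock in turn.
 */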
void
sec_stats_merge(tsdn_t *tsdn, sec_t *sec, sec_stats_t *stats) {
if (!sec_is_used(sec)) {
return;
}
size_t sum = 0;
size_t ntotal_bins = sec->opts.nshards * sec->npsizes;
for (pszind_t i = 0; i < ntotal_bins; i++) {
sec_bin_t *bin = &sec->bins[i];
malloc_mutex_lock(tsdn, &bin->mtx);
sum += bin->bytes_cur;
sec_bin_stats_accum(&stats->total, &bin->stats);
malloc_mutex_unlock(tsdn, &bin->mtx);
}
stats->bytes += sum;
}
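/*
 * Fold each bin mutex's profiling data into mutex_prof_data.
 */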
void
sec_mutex_stats_read(
tsdn_t *tsdn, sec_t *sec, mutex_prof_data_t *mutex_prof_data) {
if (!sec_is_used(sec)) {
return;
}
size_t ntotal_bins = sec->opts.nshards * sec->npsizes;
for (pszind_t i = 0; i < ntotal_bins; i++) {
sec_bin_t *bin = &sec->bins[i];
malloc_mutex_lock(tsdn, &bin->mtx);
malloc_mutex_prof_accum(tsdn, mutex_prof_data, &bin->mtx);
malloc_mutex_unlock(tsdn, &bin->mtx);
}
}
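/*
 * Fork protection: run the standard prefork/postfork hooks on every bin
 * mutex.
 */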
void
sec_prefork2(tsdn_t *tsdn, sec_t *sec) {
if (!sec_is_used(sec)) {
return;
}
size_t ntotal_bins = sec->opts.nshards * sec->npsizes;
for (pszind_t i = 0; i < ntotal_bins; i++) {
sec_bin_t *bin = &sec->bins[i];
malloc_mutex_prefork(tsdn, &bin->mtx);
}
}
void
sec_postfork_parent(tsdn_t *tsdn, sec_t *sec) {
if (!sec_is_used(sec)) {
return;
}
size_t ntotal_bins = sec->opts.nshards * sec->npsizes;
for (pszind_t i = 0; i < ntotal_bins; i++) {
sec_bin_t *bin = &sec->bins[i];
malloc_mutex_postfork_parent(tsdn, &bin->mtx);
}
}
void
sec_postfork_child(tsdn_t *tsdn, sec_t *sec) {
if (!sec_is_used(sec)) {
return;
}
size_t ntotal_bins = sec->opts.nshards * sec->npsizes;
for (pszind_t i = 0; i < ntotal_bins; i++) {
sec_bin_t *bin = &sec->bins[i];
malloc_mutex_postfork_child(tsdn, &bin->mtx);
}
}