This change includes the following improvements:

- Remove the hpa_sec_batch_fill_extra parameter.
- Refactor the hpa_alloc() code and helper functions so that more than one extent can be allocated out of a single pageslab. This amortizes the per-pageslab costs (active bitmap iteration, pageslab metadata updates) across multiple extents.
- Decide on a min and max number of extents to allocate in hpa_alloc(). The code tries to allocate at least the min, and allocates up to the max as long as additional extents can come from the pageslab it already has, since those additional allocations are relatively cheap.
- Add extent allocation distribution stats.
- Amend the hpa_sec_integration.c unit test.

#include "jemalloc/internal/jemalloc_preamble.h"
|
|
#include "jemalloc/internal/jemalloc_internal_includes.h"
|
|
|
|
#include "jemalloc/internal/hpdata.h"
|
|
|
|

static int
hpdata_age_comp(const hpdata_t *a, const hpdata_t *b) {
	uint64_t a_age = hpdata_age_get(a);
	uint64_t b_age = hpdata_age_get(b);
	/*
	 * hpdata ages are operation counts in the psset; no two should be the
	 * same.
	 */
	assert(a_age != b_age);
	return (a_age > b_age) - (a_age < b_age);
}

ph_gen(, hpdata_age_heap, hpdata_t, age_link, hpdata_age_comp)

void
hpdata_init(hpdata_t *hpdata, void *addr, uint64_t age, bool is_huge) {
	hpdata_addr_set(hpdata, addr);
	hpdata_age_set(hpdata, age);
	hpdata->h_huge = is_huge;
	hpdata->h_alloc_allowed = true;
	hpdata->h_in_psset_alloc_container = false;
	hpdata->h_purge_allowed = false;
	hpdata->h_hugify_allowed = false;
	hpdata->h_in_psset_hugify_container = false;
	hpdata->h_mid_purge = false;
	hpdata->h_mid_hugify = false;
	hpdata->h_updating = false;
	hpdata->h_in_psset = false;
	hpdata_longest_free_range_set(hpdata, HUGEPAGE_PAGES);
	hpdata->h_nactive = 0;
	fb_init(hpdata->active_pages, HUGEPAGE_PAGES);
	if (is_huge) {
		fb_set_range(
		    hpdata->touched_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES);
		hpdata->h_ntouched = HUGEPAGE_PAGES;
	} else {
		fb_init(hpdata->touched_pages, HUGEPAGE_PAGES);
		hpdata->h_ntouched = 0;
	}
	nstime_init_zero(&hpdata->h_time_purge_allowed);
	hpdata->h_purged_when_empty_and_huge = false;

	hpdata_assert_consistent(hpdata);
}

void *
hpdata_reserve_alloc(hpdata_t *hpdata, size_t sz) {
	hpdata_assert_consistent(hpdata);
	/*
	 * This is a metadata change; the hpdata should therefore either not be
	 * in the psset, or should have explicitly marked itself as being
	 * mid-update.
	 */
	assert(!hpdata->h_in_psset || hpdata->h_updating);
	assert(hpdata->h_alloc_allowed);
	assert((sz & PAGE_MASK) == 0);
	size_t npages = sz >> LG_PAGE;
	assert(npages <= hpdata_longest_free_range_get(hpdata));

	size_t result;

	size_t start = 0;
	/*
	 * These are dead stores, but the compiler will issue warnings on them
	 * since it can't tell statically that found is always true below.
	 */
	size_t begin = 0;
	size_t len = 0;

	size_t largest_unchosen_range = 0;
	while (true) {
		bool found = fb_urange_iter(
		    hpdata->active_pages, HUGEPAGE_PAGES, start, &begin, &len);
		/*
		 * A precondition to this function is that hpdata must be able
		 * to serve the allocation.
		 */
		assert(found);
		assert(len <= hpdata_longest_free_range_get(hpdata));
		if (len >= npages) {
			/*
			 * We use first-fit within the page slabs; this gives
			 * bounded worst-case fragmentation within a slab. It's
			 * not necessarily right; we could experiment with
			 * various other options.
			 */
			break;
		}
		if (len > largest_unchosen_range) {
			largest_unchosen_range = len;
		}
		start = begin + len;
	}
	/* We found a range; remember it. */
	result = begin;
	fb_set_range(hpdata->active_pages, HUGEPAGE_PAGES, begin, npages);
	hpdata->h_nactive += npages;

	/*
	 * We might be about to dirty some memory for the first time; update
	 * our count if so.
	 */
	size_t new_dirty = fb_ucount(
	    hpdata->touched_pages, HUGEPAGE_PAGES, result, npages);
	fb_set_range(hpdata->touched_pages, HUGEPAGE_PAGES, result, npages);
	hpdata->h_ntouched += new_dirty;

	/*
	 * If we allocated out of a range that was the longest in the hpdata,
	 * it might be the only one of that size and we'll have to adjust the
	 * metadata.
	 */
	if (len == hpdata_longest_free_range_get(hpdata)) {
		start = begin + npages;
		while (start < HUGEPAGE_PAGES) {
			bool found = fb_urange_iter(hpdata->active_pages,
			    HUGEPAGE_PAGES, start, &begin, &len);
			if (!found) {
				break;
			}
			assert(len <= hpdata_longest_free_range_get(hpdata));
			if (len == hpdata_longest_free_range_get(hpdata)) {
				largest_unchosen_range = len;
				break;
			}
			if (len > largest_unchosen_range) {
				largest_unchosen_range = len;
			}
			start = begin + len;
		}
		hpdata_longest_free_range_set(hpdata, largest_unchosen_range);
	}

	hpdata_assert_consistent(hpdata);
	return (void *)((byte_t *)hpdata_addr_get(hpdata)
	    + (result << LG_PAGE));
}

void
hpdata_unreserve(hpdata_t *hpdata, void *addr, size_t sz) {
	hpdata_assert_consistent(hpdata);
	/* See the comment in reserve. */
	assert(!hpdata->h_in_psset || hpdata->h_updating);
	assert(((uintptr_t)addr & PAGE_MASK) == 0);
	assert((sz & PAGE_MASK) == 0);
	size_t begin = ((uintptr_t)addr - (uintptr_t)hpdata_addr_get(hpdata))
	    >> LG_PAGE;
	assert(begin < HUGEPAGE_PAGES);
	size_t npages = sz >> LG_PAGE;
	size_t old_longest_range = hpdata_longest_free_range_get(hpdata);

	fb_unset_range(hpdata->active_pages, HUGEPAGE_PAGES, begin, npages);
	/* We might have just created a new, larger range. */
	size_t new_begin = (fb_fls(hpdata->active_pages, HUGEPAGE_PAGES, begin)
	    + 1);
	size_t new_end = fb_ffs(
	    hpdata->active_pages, HUGEPAGE_PAGES, begin + npages - 1);
	size_t new_range_len = new_end - new_begin;

	if (new_range_len > old_longest_range) {
		hpdata_longest_free_range_set(hpdata, new_range_len);
	}

	hpdata->h_nactive -= npages;

	hpdata_assert_consistent(hpdata);
}
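
/*
 * Illustrative sketch only, not part of the upstream file: a minimal,
 * unreferenced walk-through of the single-extent reserve/unreserve API above,
 * assuming slab_base is the HUGEPAGE-aligned, HUGEPAGE-sized region backing
 * the slab.  It shows the first-fit placement and the longest-free-range
 * bookkeeping described in hpdata_reserve_alloc().  The function and
 * parameter names here are hypothetical.
 */
static void
hpdata_reserve_unreserve_sketch(hpdata_t *hpdata, void *slab_base) {
	hpdata_init(hpdata, slab_base, /* age */ 0, /* is_huge */ false);

	/* First-fit: both extents come from the start of the slab. */
	void *a = hpdata_reserve_alloc(hpdata, 2 * PAGE);
	void *b = hpdata_reserve_alloc(hpdata, PAGE);
	assert((byte_t *)b == (byte_t *)a + 2 * PAGE);

	/* Three pages are now active; the longest free run shrank. */
	assert(hpdata_longest_free_range_get(hpdata) == HUGEPAGE_PAGES - 3);

	/* Freeing the trailing extent merges it back into the free run. */
	hpdata_unreserve(hpdata, b, PAGE);
	assert(hpdata_longest_free_range_get(hpdata) == HUGEPAGE_PAGES - 2);

	/* Freeing the first extent leaves the slab entirely free again. */
	hpdata_unreserve(hpdata, a, 2 * PAGE);
	assert(hpdata_longest_free_range_get(hpdata) == HUGEPAGE_PAGES);
}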

size_t
hpdata_find_alloc_offsets(hpdata_t *hpdata, size_t sz,
    hpdata_alloc_offset_t *offsets, size_t max_nallocs) {
	hpdata_assert_consistent(hpdata);
	assert((sz & PAGE_MASK) == 0);
	assert(1 <= max_nallocs);
	const size_t npages = sz >> LG_PAGE;
	/* We should be able to find at least one allocation. */
	assert(npages <= hpdata_longest_free_range_get(hpdata));

	size_t nallocs = 0;
	size_t start = 0;
	size_t longest_len = 0;
	while (true) {
		size_t begin = 0;
		size_t len = 0;

		const bool found = fb_urange_iter(
		    hpdata->active_pages, HUGEPAGE_PAGES, start, &begin, &len);
		if (!found) {
			/* We should have found at least one. */
			assert(0 < nallocs);
			break;
		}

		/* Carve up the free range, if it's large enough. */
		while (npages <= len) {
			offsets->len_before = len;
			offsets->index = begin;
			offsets->longest_len = longest_len;
			offsets += 1;

			nallocs += 1;
			if (nallocs == max_nallocs) {
				/*
				 * Force start == HUGEPAGE_PAGES below, so
				 * that the outer loop terminates too.
				 */
				begin = HUGEPAGE_PAGES;
				len = 0;
				break;
			}

			begin += npages;
			len -= npages;
		}

		start = begin + len;
		assert(start <= HUGEPAGE_PAGES);
		if (start == HUGEPAGE_PAGES) {
			break;
		}
		longest_len = max_zu(longest_len, len);
	}

	/* Post-conditions. */
	assert(1 <= nallocs);
	assert(nallocs <= max_nallocs);

	return nallocs;
}

void *
hpdata_reserve_alloc_offset(
    hpdata_t *hpdata, size_t sz, hpdata_alloc_offset_t *offset) {
	/*
	 * This is a metadata change; the hpdata should therefore either not be
	 * in the psset, or should have explicitly marked itself as being
	 * mid-update.
	 */
	assert(!hpdata->h_in_psset || hpdata->h_updating);
	assert(hpdata->h_alloc_allowed);
	assert((sz & PAGE_MASK) == 0);
	const size_t npages = sz >> LG_PAGE;
	const size_t index = offset->index;

	fb_set_range(hpdata->active_pages, HUGEPAGE_PAGES, index, npages);
	hpdata->h_nactive += npages;

	/*
	 * We might be about to dirty some memory for the first time; update
	 * our count if so.
	 */
	size_t new_dirty = fb_ucount(
	    hpdata->touched_pages, HUGEPAGE_PAGES, index, npages);
	fb_set_range(hpdata->touched_pages, HUGEPAGE_PAGES, index, npages);
	hpdata->h_ntouched += new_dirty;

	return (void *)((byte_t *)hpdata_addr_get(hpdata) + (index << LG_PAGE));
}

void
hpdata_post_reserve_alloc_offsets(hpdata_t *hpdata, size_t sz,
    hpdata_alloc_offset_t *offsets, size_t nallocs) {
	assert((sz & PAGE_MASK) == 0);
	const size_t npages = sz >> LG_PAGE;

	if (nallocs == 0) {
		return;
	}

	size_t max_len = offsets[0].len_before;
	for (size_t i = 1; i < nallocs; i += 1) {
		max_len = max_zu(max_len, offsets[i].len_before);
	}

	const size_t prev_longest = hpdata_longest_free_range_get(hpdata);
	assert(max_len <= prev_longest);
	if (max_len < prev_longest) {
		/* No need to update the hpdata's longest free range. */
		return;
	}

	/*
	 * If we allocated out of a range that was the longest in the hpdata,
	 * it might be the only one of that size and we'll have to adjust the
	 * metadata.
	 */
	const size_t len_before = offsets[nallocs - 1].len_before;
	size_t start = offsets[nallocs - 1].index + len_before;
	size_t longest_len = max_zu(
	    offsets[nallocs - 1].longest_len, len_before - npages);

	const size_t rest = HUGEPAGE_PAGES - start;
	/*
	 * Only look at the rest if we think we'll find a range longer than
	 * what we already have.  This also implicitly checks for rest == 0,
	 * so we don't have to check before the first call to fb_urange_iter().
	 */
	if (longest_len < rest) {
		while (true) {
			size_t begin = 0;
			size_t len = 0;

			const bool found = fb_urange_iter(hpdata->active_pages,
			    HUGEPAGE_PAGES, start, &begin, &len);
			if (!found) {
				break;
			}

			if (longest_len < len) {
				if (len == prev_longest) {
					/* It's already set to the right value. */
					assert(hpdata_longest_free_range_get(
					    hpdata) == len);
					assert(fb_urange_longest(
					    hpdata->active_pages,
					    HUGEPAGE_PAGES) == len);
					return;
				}
				longest_len = len;
			}

			start = begin + len;
			assert(start <= HUGEPAGE_PAGES);
			if (start == HUGEPAGE_PAGES) {
				break;
			}
		}
	}

	assert(fb_urange_longest(hpdata->active_pages, HUGEPAGE_PAGES)
	    == longest_len);
	hpdata_longest_free_range_set(hpdata, longest_len);
}
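
/*
 * Illustrative sketch only, not part of the upstream file: how a caller such
 * as the HPA batch-allocation path might combine the three offset functions
 * above to amortize per-pageslab work across several extents.  The function
 * name, the ptrs output array, and the fixed-size offsets buffer are
 * hypothetical; the real driver lives in hpa.c.  The pageslab is assumed to
 * be able to serve at least one allocation of size sz.
 */
static size_t
hpdata_batch_alloc_sketch(hpdata_t *hpdata, size_t sz, void **ptrs,
    size_t max_nallocs) {
	/* Find up to max_nallocs placements in one pass over the bitmap. */
	hpdata_alloc_offset_t offsets[8];
	if (max_nallocs > 8) {
		max_nallocs = 8;
	}
	size_t nallocs = hpdata_find_alloc_offsets(hpdata, sz, offsets,
	    max_nallocs);

	/* Mark each chosen range active (and touched, if newly dirtied). */
	for (size_t i = 0; i < nallocs; i++) {
		ptrs[i] = hpdata_reserve_alloc_offset(hpdata, sz, &offsets[i]);
	}

	/* Fix up the longest-free-range metadata once, for the whole batch. */
	hpdata_post_reserve_alloc_offsets(hpdata, sz, offsets, nallocs);
	return nallocs;
}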

size_t
hpdata_purge_begin(
    hpdata_t *hpdata, hpdata_purge_state_t *purge_state, size_t *nranges) {
	hpdata_assert_consistent(hpdata);
	/*
	 * See the comment below; we might purge any inactive extent, so it's
	 * unsafe for any other thread to turn any inactive extent active while
	 * we're operating on it.
	 */
	assert(!hpdata_alloc_allowed_get(hpdata));

	purge_state->npurged = 0;
	purge_state->next_purge_search_begin = 0;

	/*
	 * Initialize to_purge.
	 *
	 * It's possible to end up in situations where two dirty extents are
	 * separated by a retained extent:
	 * - 1 page allocated.
	 * - 1 page allocated.
	 * - 1 page allocated.
	 *
	 * If the middle page is freed and purged, and then the first and third
	 * pages are freed, and then another purge pass happens, the hpdata
	 * looks like this:
	 * - 1 page dirty.
	 * - 1 page retained.
	 * - 1 page dirty.
	 *
	 * But it's safe to do a single 3-page purge.
	 *
	 * We do this by first computing the dirty pages, and then filling in
	 * any gaps by extending each range in the dirty bitmap until the next
	 * active page. This purges more pages, but the expensive part of
	 * purging is the TLB shootdowns, rather than the kernel state
	 * tracking; doing a little bit more of the latter is fine if it saves
	 * us from doing some of the former.
	 */

	/*
	 * The dirty pages are those that are touched but not active. Note that
	 * in a normal-ish case, HUGEPAGE_PAGES is something like 512 and the
	 * fb_group_t is 64 bits, so this is 64 bytes, spread across 8
	 * fb_group_ts.
	 */
	fb_group_t dirty_pages[FB_NGROUPS(HUGEPAGE_PAGES)];
	fb_init(dirty_pages, HUGEPAGE_PAGES);
	fb_bit_not(dirty_pages, hpdata->active_pages, HUGEPAGE_PAGES);
	fb_bit_and(
	    dirty_pages, dirty_pages, hpdata->touched_pages, HUGEPAGE_PAGES);

	fb_init(purge_state->to_purge, HUGEPAGE_PAGES);
	size_t next_bit = 0;
	*nranges = 0;
	while (next_bit < HUGEPAGE_PAGES) {
		size_t next_dirty = fb_ffs(
		    dirty_pages, HUGEPAGE_PAGES, next_bit);
		/* Recall that fb_ffs returns nbits if no set bit is found. */
		if (next_dirty == HUGEPAGE_PAGES) {
			break;
		}
		size_t next_active = fb_ffs(
		    hpdata->active_pages, HUGEPAGE_PAGES, next_dirty);
		/*
		 * Don't purge past the end of the dirty extent, into retained
		 * pages. This helps the kernel a tiny bit, but honestly it's
		 * mostly helpful for testing (where we tend to write test
		 * cases that think in terms of the dirty ranges).
		 */
		ssize_t last_dirty = fb_fls(
		    dirty_pages, HUGEPAGE_PAGES, next_active - 1);
		assert(last_dirty >= 0);
		assert((size_t)last_dirty >= next_dirty);
		assert((size_t)last_dirty - next_dirty + 1 <= HUGEPAGE_PAGES);

		fb_set_range(purge_state->to_purge, HUGEPAGE_PAGES, next_dirty,
		    last_dirty - next_dirty + 1);
		(*nranges)++;
		next_bit = next_active + 1;
	}

	/* We should purge, at least, everything dirty. */
	size_t ndirty = hpdata->h_ntouched - hpdata->h_nactive;
	purge_state->ndirty_to_purge = ndirty;
	assert(ndirty <= fb_scount(
	    purge_state->to_purge, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES));
	assert(ndirty
	    == fb_scount(dirty_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES));
	assert(*nranges <= ndirty);
	assert(ndirty == 0 || *nranges > 0);

	hpdata_assert_consistent(hpdata);

	return ndirty;
}

bool
hpdata_purge_next(hpdata_t *hpdata, hpdata_purge_state_t *purge_state,
    void **r_purge_addr, size_t *r_purge_size) {
	/*
	 * Note that we don't have a consistency check here; we're accessing
	 * hpdata without synchronization, and therefore have no right to
	 * expect a consistent state.
	 */
	assert(!hpdata_alloc_allowed_get(hpdata));

	if (purge_state->next_purge_search_begin == HUGEPAGE_PAGES) {
		return false;
	}
	size_t purge_begin;
	size_t purge_len;
	bool found_range = fb_srange_iter(purge_state->to_purge, HUGEPAGE_PAGES,
	    purge_state->next_purge_search_begin, &purge_begin, &purge_len);
	if (!found_range) {
		return false;
	}

	*r_purge_addr = (void *)((byte_t *)hpdata_addr_get(hpdata)
	    + purge_begin * PAGE);
	*r_purge_size = purge_len * PAGE;

	purge_state->next_purge_search_begin = purge_begin + purge_len;
	purge_state->npurged += purge_len;
	assert(purge_state->npurged <= HUGEPAGE_PAGES);

	return true;
}

void
hpdata_purge_end(hpdata_t *hpdata, hpdata_purge_state_t *purge_state) {
	assert(!hpdata_alloc_allowed_get(hpdata));
	hpdata_assert_consistent(hpdata);
	/* See the comment in reserve. */
	assert(!hpdata->h_in_psset || hpdata->h_updating);

	assert(purge_state->npurged == fb_scount(
	    purge_state->to_purge, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES));
	assert(purge_state->npurged >= purge_state->ndirty_to_purge);

	fb_bit_not(
	    purge_state->to_purge, purge_state->to_purge, HUGEPAGE_PAGES);
	fb_bit_and(hpdata->touched_pages, hpdata->touched_pages,
	    purge_state->to_purge, HUGEPAGE_PAGES);
	assert(hpdata->h_ntouched >= purge_state->ndirty_to_purge);
	hpdata->h_ntouched -= purge_state->ndirty_to_purge;

	hpdata_assert_consistent(hpdata);
}
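
/*
 * Illustrative sketch only, not part of the upstream file: the begin / next /
 * end protocol above, driven by a hypothetical caller.  purge_pages is a
 * stand-in for the real purge hook (e.g. madvise-based purging); the actual
 * driver lives in hpa.c and manages locking and psset membership around these
 * steps.  The traced values follow the dirty/retained/dirty scenario from the
 * comment in hpdata_purge_begin(), assuming slab_base backs a fresh slab.
 */
static void
hpdata_purge_sketch(hpdata_t *hpdata, void *slab_base,
    void (*purge_pages)(void *addr, size_t size)) {
	hpdata_init(hpdata, slab_base, /* age */ 0, /* is_huge */ false);

	/* Touch three pages, then free the middle one. */
	void *pages = hpdata_reserve_alloc(hpdata, 3 * PAGE);
	hpdata_unreserve(hpdata, (byte_t *)pages + PAGE, PAGE);

	/* Purge pass 1: only the middle page is dirty. */
	hpdata_alloc_allowed_set(hpdata, false);
	hpdata_purge_state_t purge_state;
	size_t nranges;
	size_t ndirty = hpdata_purge_begin(hpdata, &purge_state, &nranges);
	assert(ndirty == 1 && nranges == 1);
	void *addr;
	size_t size;
	while (hpdata_purge_next(hpdata, &purge_state, &addr, &size)) {
		purge_pages(addr, size);
	}
	hpdata_purge_end(hpdata, &purge_state);

	/* Free the outer pages, leaving dirty / retained / dirty. */
	hpdata_alloc_allowed_set(hpdata, true);
	hpdata_unreserve(hpdata, pages, PAGE);
	hpdata_unreserve(hpdata, (byte_t *)pages + 2 * PAGE, PAGE);

	/* Purge pass 2: the two dirty pages coalesce into one 3-page range. */
	hpdata_alloc_allowed_set(hpdata, false);
	ndirty = hpdata_purge_begin(hpdata, &purge_state, &nranges);
	assert(ndirty == 2 && nranges == 1);
	bool found = hpdata_purge_next(hpdata, &purge_state, &addr, &size);
	assert(found && size == 3 * PAGE);
	purge_pages(addr, size);
	hpdata_purge_end(hpdata, &purge_state);
}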

void
hpdata_hugify(hpdata_t *hpdata) {
	hpdata_assert_consistent(hpdata);
	hpdata->h_huge = true;
	fb_set_range(hpdata->touched_pages, HUGEPAGE_PAGES, 0, HUGEPAGE_PAGES);
	hpdata->h_ntouched = HUGEPAGE_PAGES;
	hpdata_assert_consistent(hpdata);
}

void
hpdata_dehugify(hpdata_t *hpdata) {
	hpdata_assert_consistent(hpdata);
	hpdata->h_huge = false;
	hpdata_assert_consistent(hpdata);
}