jemalloc/src/pages.c

#include "jemalloc/internal/jemalloc_preamble.h"

#include "jemalloc/internal/pages.h"

#include "jemalloc/internal/jemalloc_internal_includes.h"

#include "jemalloc/internal/assert.h"
#include "jemalloc/internal/malloc_io.h"

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
#	include <sys/sysctl.h>
#	ifdef __FreeBSD__
#		include <vm/vm_param.h>
#	endif
#endif
#ifdef __NetBSD__
#	include <sys/bitops.h> /* ilog2 */
#endif
#ifdef JEMALLOC_HAVE_VM_MAKE_TAG
#	define PAGES_FD_TAG VM_MAKE_TAG(254U)
#else
#	define PAGES_FD_TAG -1
#endif
#if defined(JEMALLOC_HAVE_PRCTL) && defined(JEMALLOC_PAGEID)
#	include <sys/prctl.h>
#	ifndef PR_SET_VMA
#		define PR_SET_VMA 0x53564d41
#		define PR_SET_VMA_ANON_NAME 0
#	endif
#endif

/******************************************************************************/
/* Data. */

/* Actual operating system page size, detected during bootstrap, <= PAGE. */
size_t os_page;

#ifndef _WIN32
/* mmap() protections used for committed and decommitted ranges. */
#	define PAGES_PROT_COMMIT (PROT_READ | PROT_WRITE)
#	define PAGES_PROT_DECOMMIT (PROT_NONE)
/*
 * Base flags handed to every mmap() call in this file.  Initialized by
 * pages_boot() to MAP_PRIVATE | MAP_ANON, plus MAP_NORESERVE when the
 * /proc-based probe reports that the OS overcommits.
 */
static int mmap_flags;
#endif
/*
 * Whether the OS was detected to overcommit memory (set by pages_boot()).
 * When true, os_pages_map() forces *commit to true and pages_commit_impl()
 * returns true without calling into the OS.
 */
static bool os_overcommits;

/*
 * Human-readable names for the THP modes; presumably indexed by thp_mode_t
 * and system_thp_mode_t respectively -- verify against the enum declarations.
 */
const char *const thp_mode_names[] = {
    "default", "always", "never", "not supported"};
const char *const system_thp_mode_names[] = {
    "madvise", "always", "never", "not supported"};
/* Requested THP mode; reset to thp_mode_not_supported by init_thp_state(). */
thp_mode_t        opt_thp = THP_MODE_DEFAULT;
/* System-wide THP mode as detected by init_thp_state() during boot. */
system_thp_mode_t init_system_thp_mode;

/* Runtime support for lazy purge. Irrelevant when !pages_can_purge_lazy. */
static bool pages_can_purge_lazy_runtime = true;

#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS
/*
 * Result of the runtime MADV_DONTNEED probe: -1 until pages_boot() sets it,
 * then 0 (pages are zeroed as expected) or 1 (zeroing cannot be relied on).
 */
static int madvise_dont_need_zeros_is_faulty = -1;
/**
 * Check that MADV_DONTNEED actually zeroes pages on subsequent access.
 *
 * qemu does not support this yet [1], and running a program that uses
 * jemalloc under qemu can trigger a hard-to-diagnose assertion failure:
 *
 *     <jemalloc>: ../contrib/jemalloc/src/extent.c:1195: Failed assertion: "p[i] == 0"
 *
 *   [1]: https://patchwork.kernel.org/patch/10576637/
 *
 * Returns 1 if MADV_DONTNEED may be used (either the pages were observed to
 * be zeroed, or madvise() itself failed and its return code will protect
 * later callers), and 0 if zeroing was not observed or could not be probed.
 */
static int
madvise_MADV_DONTNEED_zeroes_pages(void) {
	size_t size = PAGE;

	void *addr = mmap(NULL, size, PROT_READ | PROT_WRITE,
	    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (addr == MAP_FAILED) {
		malloc_write(
		    "<jemalloc>: Cannot allocate memory for "
		    "MADV_DONTNEED check\n");
		if (opt_abort) {
			abort();
		}
		/*
		 * There is no page to probe; touching MAP_FAILED would crash.
		 * Unlike the madvise() failure case below there is no later
		 * return code to fall back on, so report the conservative
		 * answer: zeroing could not be confirmed.
		 */
		return 0;
	}

	/* Dirty the page so that zeroing is observable afterwards. */
	memset(addr, 'A', size);
	int works;
	if (madvise(addr, size, MADV_DONTNEED) == 0) {
		works = memchr(addr, 'A', size) == NULL;
	} else {
		/*
		 * If madvise() does not support MADV_DONTNEED, we can call it
		 * anyway and rely on its return code.
		 */
		works = 1;
	}

	if (munmap(addr, size) != 0) {
		malloc_write(
		    "<jemalloc>: Cannot deallocate memory for "
		    "MADV_DONTNEED check\n");
		if (opt_abort) {
			abort();
		}
	}

	return works;
}
#endif

#ifdef JEMALLOC_PAGEID
/*
 * Attach `name` to the anonymous mapping [addr, addr + size) via
 * prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ...) where prctl is available.
 * Returns the prctl() result, or 0 when prctl is not compiled in.
 */
static int
os_page_id(void *addr, size_t size, const char *name) {
#	ifdef JEMALLOC_HAVE_PRCTL
	/*
	 * A named block shows up in `/proc/<pid>/maps` like:
	 * 7f4836000000-7f4836800000 rw-p 00000000 00:00 0 [anon:jemalloc_pg_overcommit]
	 */
	uintptr_t range_start = (uintptr_t)addr;
	uintptr_t label = (uintptr_t)name;

	return prctl(
	    PR_SET_VMA, PR_SET_VMA_ANON_NAME, range_start, size, label);
#	else
	return 0;
#	endif
}
#endif

/******************************************************************************/
/*
 * Function prototypes for static functions that are referenced prior to
 * definition.
 */

static void os_pages_unmap(void *addr, size_t size);

/******************************************************************************/

/*
 * Map `size` bytes of anonymous memory, optionally at the hint `addr`.
 *
 * addr:      NULL, or an os_page-aligned address at which the mapping must
 *            land; a mapping placed anywhere else is released again.
 * size:      non-zero multiple of os_page.
 * alignment: only consulted on NetBSD, where it is passed via MAP_ALIGNED.
 * commit:    in: whether committed (read/write) memory is requested;
 *            out: forced to true when the OS overcommits.
 *
 * Returns the mapping, or NULL on failure (including "mapped, but not at
 * addr").
 */
static void *
os_pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(ALIGNMENT_CEILING(size, os_page) == size);
	assert(size != 0);

	if (os_overcommits) {
		*commit = true;
	}

	void *ret;
#ifdef _WIN32
	/*
	 * If VirtualAlloc can't allocate at the given address when one is
	 * given, it fails and returns NULL.
	 */
	ret = VirtualAlloc(addr, size, MEM_RESERVE | (*commit ? MEM_COMMIT : 0),
	    PAGE_READWRITE);
#else
	/*
	 * We don't use MAP_FIXED here, because it can cause the *replacement*
	 * of existing mappings, and we only want to create new mappings.
	 */
	{
		int flags = mmap_flags;
#	ifdef __NetBSD__
		/*
		 * On NetBSD PAGE for a platform is defined to the
		 * maximum page size of all machine architectures
		 * for that platform, so that we can use the same
		 * binaries across all machine architectures.
		 */
		if (alignment > os_page || PAGE > os_page) {
			unsigned int a = ilog2(MAX(alignment, PAGE));
			flags |= MAP_ALIGNED(a);
		}
#	endif
		int prot = *commit ? PAGES_PROT_COMMIT : PAGES_PROT_DECOMMIT;

		ret = mmap(addr, size, prot, flags, PAGES_FD_TAG, 0);
	}
	assert(ret != NULL);

	if (ret == MAP_FAILED) {
		ret = NULL;
	} else if (addr != NULL && ret != addr) {
		/*
		 * We succeeded in mapping memory, but not in the right place.
		 */
		os_pages_unmap(ret, size);
		ret = NULL;
	}
#endif
	assert(ret == NULL || (addr == NULL && ret != addr)
	    || (addr != NULL && ret == addr));
#ifdef JEMALLOC_PAGEID
	/*
	 * Only label mappings that actually exist.  Naming a failed mapping
	 * would issue a prctl() on address 0, which can fail with an errno
	 * other than EINVAL and needlessly clobbers errno for the caller.
	 */
	if (ret != NULL) {
		int n = os_page_id(ret, size,
		    os_overcommits ? "jemalloc_pg_overcommit" : "jemalloc_pg");
		assert(n == 0 || (n == -1 && get_errno() == EINVAL));
	}
#endif
	return ret;
}

static void *
os_pages_trim(
    void *addr, size_t alloc_size, size_t leadsize, size_t size, bool *commit) {
	void *ret = (void *)((byte_t *)addr + leadsize);

	assert(alloc_size >= leadsize + size);
#ifdef _WIN32
	os_pages_unmap(addr, alloc_size);
	void *new_addr = os_pages_map(ret, size, PAGE, commit);
	if (new_addr == ret) {
		return ret;
	}
	if (new_addr != NULL) {
		os_pages_unmap(new_addr, size);
	}
	return NULL;
#else
	size_t trailsize = alloc_size - leadsize - size;

	if (leadsize != 0) {
		os_pages_unmap(addr, leadsize);
	}
	if (trailsize != 0) {
		os_pages_unmap((void *)((byte_t *)ret + size), trailsize);
	}
	return ret;
#endif
}

static void
os_pages_unmap(void *addr, size_t size) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(ALIGNMENT_CEILING(size, os_page) == size);

#ifdef _WIN32
	if (VirtualFree(addr, 0, MEM_RELEASE) == 0)
#else
	if (munmap(addr, size) == -1)
#endif
	{
		char buf[BUFERROR_BUF];

		buferror(get_errno(), buf, sizeof(buf));
		malloc_printf(
		    "<jemalloc>: Error in "
#ifdef _WIN32
		    "VirtualFree"
#else
		    "munmap"
#endif
		    "(): %s\n",
		    buf);
		if (opt_abort) {
			abort();
		}
	}
}

/*
 * Reliable fallback for aligned mappings: map `alignment - os_page` extra
 * bytes, then trim the result down to an aligned `size`-byte range.  The
 * attempt is repeated for as long as trimming fails.  Returns NULL when the
 * padded size overflows or the OS refuses the mapping.
 */
static void *
pages_map_slow(size_t size, size_t alignment, bool *commit) {
	size_t padded_size = size + alignment - os_page;
	/* Beware size_t wrap-around. */
	if (padded_size < size) {
		return NULL;
	}

	void *aligned = NULL;
	while (aligned == NULL) {
		void *region = os_pages_map(
		    NULL, padded_size, alignment, commit);
		if (region == NULL) {
			return NULL;
		}
		uintptr_t region_start = (uintptr_t)region;
		size_t    lead_bytes =
		    ALIGNMENT_CEILING(region_start, alignment) - region_start;
		aligned = os_pages_trim(
		    region, padded_size, lead_bytes, size, commit);
	}

	assert(aligned != NULL);
	assert(PAGE_ADDR2BASE(aligned) == aligned);
	return aligned;
}

/*
 * Map `size` bytes aligned to `alignment` (>= PAGE), optionally at `addr`.
 * *commit is an in/out flag with the same meaning as for os_pages_map().
 * Returns the mapping, or NULL on failure.
 */
void *
pages_map(void *addr, size_t size, size_t alignment, bool *commit) {
	assert(alignment >= PAGE);
	assert(ALIGNMENT_ADDR2BASE(addr, alignment) == addr);

#if defined(__FreeBSD__) && defined(MAP_EXCL)
	/*
	 * FreeBSD can both mmap at a specific address without touching
	 * existing mappings and mmap with a specific alignment, so a single
	 * call is enough.
	 */
	{
		if (os_overcommits) {
			*commit = true;
		}

		int map_flags = mmap_flags;
		if (addr == NULL) {
			unsigned lg_alignment = ffs_zu(alignment);
			assert(lg_alignment > 0);
			map_flags |= MAP_ALIGNED(lg_alignment);
		} else {
			map_flags |= MAP_FIXED | MAP_EXCL;
		}

		int map_prot = *commit ? PAGES_PROT_COMMIT
		                       : PAGES_PROT_DECOMMIT;
		void *fixed = mmap(addr, size, map_prot, map_flags, -1, 0);
		return (fixed == MAP_FAILED) ? NULL : fixed;
	}
#endif
	/*
	 * Without a way to pass an alignment to mmap() (as NetBSD has), aligned
	 * mappings take extra work.  The reliable but slow method over-maps and
	 * trims the excess; that always costs one or two os_pages_unmap() calls
	 * and can leave holes in the virtual memory map if memory grows
	 * downward.
	 *
	 * So first optimistically map exactly `size` bytes, expecting that to
	 * be suitably aligned most of the time, and only fall back to the slow
	 * method when it is not.
	 */
	void *mapped = os_pages_map(addr, size, os_page, commit);
	if (mapped == NULL || mapped == addr) {
		return mapped;
	}
	assert(addr == NULL);

	if (ALIGNMENT_ADDR2OFFSET(mapped, alignment) == 0) {
		assert(PAGE_ADDR2BASE(mapped) == mapped);
		return mapped;
	}

	/* Misaligned: discard the optimistic mapping and do it the slow way. */
	os_pages_unmap(mapped, size);
	return pages_map_slow(size, alignment, commit);
}

/*
 * Public unmap entry point.  Unlike os_pages_unmap(), the range is required
 * to be PAGE-granular before it is handed on.
 */
void
pages_unmap(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	void  *range_start = addr;
	size_t range_bytes = size;
	os_pages_unmap(range_start, range_bytes);
}

/*
 * Commit (commit == true) or decommit (commit == false) the PAGE-granular
 * range [addr, addr + size) at the OS level.  Returns false on success and
 * true on failure.
 */
static bool
os_pages_commit(void *addr, size_t size, bool commit) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

#ifdef _WIN32
	if (commit) {
		void *committed = VirtualAlloc(
		    addr, size, MEM_COMMIT, PAGE_READWRITE);
		return addr != committed;
	}
	return !VirtualFree(addr, size, MEM_DECOMMIT);
#else
	/* Overlay a fresh mapping with the wanted protection in place. */
	int prot;
	if (commit) {
		prot = PAGES_PROT_COMMIT;
	} else {
		prot = PAGES_PROT_DECOMMIT;
	}

	void *overlay = mmap(
	    addr, size, prot, mmap_flags | MAP_FIXED, PAGES_FD_TAG, 0);
	if (overlay == MAP_FAILED) {
		return true;
	}
	if (overlay == addr) {
		return false;
	}
	/* We succeeded in mapping memory, but not in the right place. */
	os_pages_unmap(overlay, size);
	return true;
#endif
}

/*
 * Shared body of pages_commit() and pages_decommit().  When the OS
 * overcommits, no OS call is made and true is returned; otherwise the result
 * of os_pages_commit() is passed through.
 */
static bool
pages_commit_impl(void *addr, size_t size, bool commit) {
	return os_overcommits ? true : os_pages_commit(addr, size, commit);
}

/* Commit [addr, addr + size); see pages_commit_impl() for the result. */
bool
pages_commit(void *addr, size_t size) {
	const bool want_commit = true;
	return pages_commit_impl(addr, size, want_commit);
}

/* Decommit [addr, addr + size); see pages_commit_impl() for the result. */
bool
pages_decommit(void *addr, size_t size) {
	const bool want_commit = false;
	return pages_commit_impl(addr, size, want_commit);
}

/*
 * Make the guard page at `head` and/or `tail` inaccessible.  Either pointer
 * may be NULL (but not both); each non-NULL one covers exactly PAGE bytes.
 * The head page is processed before the tail page.
 */
void
pages_mark_guards(void *head, void *tail) {
	assert(head != NULL || tail != NULL);
	assert(
	    head == NULL || tail == NULL || (uintptr_t)head < (uintptr_t)tail);

	void *guards[2] = {head, tail};
	for (unsigned i = 0; i < 2; i++) {
		void *guard = guards[i];
		if (guard == NULL) {
			continue;
		}
#ifdef JEMALLOC_HAVE_MPROTECT
		mprotect(guard, PAGE, PROT_NONE);
#else
		/* Decommit sets to PROT_NONE / MEM_DECOMMIT. */
		os_pages_commit(guard, PAGE, false);
#endif
	}
}

/*
 * Undo pages_mark_guards(): make the guard page at `head` and/or `tail`
 * readable and writable again.  Either pointer may be NULL, but not both.
 */
void
pages_unmark_guards(void *head, void *tail) {
	assert(head != NULL || tail != NULL);
	assert(
	    head == NULL || tail == NULL || (uintptr_t)head < (uintptr_t)tail);
#ifdef JEMALLOC_HAVE_MPROTECT
	if (head != NULL && tail != NULL) {
		size_t span = (uintptr_t)tail - (uintptr_t)head + PAGE;
		/*
		 * The amount of work the kernel does in mprotect depends on
		 * the length of the range.  SC_LARGE_MINCLASS is an arbitrary
		 * threshold below which a single call covering both guards
		 * (and everything in between) is taken to be cheaper than two
		 * separate system calls.
		 */
		if (span <= SC_LARGE_MINCLASS) {
			mprotect(head, span, PROT_READ | PROT_WRITE);
			return;
		}
	}

	if (head != NULL) {
		mprotect(head, PAGE, PROT_READ | PROT_WRITE);
	}
	if (tail != NULL) {
		mprotect(tail, PAGE, PROT_READ | PROT_WRITE);
	}
#else
	void *guards[2] = {head, tail};
	for (unsigned i = 0; i < 2; i++) {
		if (guards[i] != NULL) {
			os_pages_commit(guards[i], PAGE, true);
		}
	}
#endif
}

/*
 * Lazily purge [addr, addr + size) using whichever mechanism was selected at
 * build time.  Returns true when lazy purging is unavailable (at build time
 * or as detected at run time) or when the OS call fails, false otherwise.
 */
bool
pages_purge_lazy(void *addr, size_t size) {
	assert(ALIGNMENT_ADDR2BASE(addr, os_page) == addr);
	assert(PAGE_CEILING(size) == size);

	/*
	 * Either not built with lazy purge at all, or built with it but found
	 * unsupported on the current system.
	 */
	if (!pages_can_purge_lazy || !pages_can_purge_lazy_runtime) {
		return true;
	}

#ifdef _WIN32
	VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
	return false;
#elif defined(JEMALLOC_PURGE_MADVISE_FREE)
#	ifdef MADV_FREE
	const int lazy_advice = MADV_FREE;
#	else
	const int lazy_advice = JEMALLOC_MADV_FREE;
#	endif
	return madvise(addr, size, lazy_advice) != 0;
#elif defined(JEMALLOC_PURGE_MADVISE_DONTNEED)                                 \
    && !defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	return madvise(addr, size, MADV_DONTNEED) != 0;
#elif defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED)                           \
    && !defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS)
	return posix_madvise(addr, size, POSIX_MADV_DONTNEED) != 0;
#else
	not_reached();
#endif
}

/*
 * Forcibly purge [addr, addr + size).  Returns true when forced purging is
 * unavailable, when the MADV_DONTNEED probe flagged zeroing as faulty, or
 * when the OS call fails; false otherwise.
 */
bool
pages_purge_forced(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);

	if (!pages_can_purge_forced) {
		return true;
	}

#if defined(JEMALLOC_PURGE_MADVISE_DONTNEED)                                   \
    && defined(JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS)
	if (unlikely(madvise_dont_need_zeros_is_faulty)) {
		return true;
	}
	return madvise(addr, size, MADV_DONTNEED) != 0;
#elif defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED)                           \
    && defined(JEMALLOC_PURGE_POSIX_MADVISE_DONTNEED_ZEROS)
	if (unlikely(madvise_dont_need_zeros_is_faulty)) {
		return true;
	}
	return posix_madvise(addr, size, POSIX_MADV_DONTNEED) != 0;
#elif defined(JEMALLOC_MAPS_COALESCE)
	/* Try to overlay a new demand-zeroed mapping. */
	return pages_commit(addr, size);
#else
	not_reached();
#endif
}

/*
 * Ask the OS to back [addr, addr + size) with huge pages.
 *
 * aligned: when true, addr and size are asserted to be HUGEPAGE-granular.
 *
 * Returns false on success and true on failure, the convention used by the
 * other advice helpers in this file; true is also returned when no huge page
 * mechanism is compiled in.
 */
static bool
pages_huge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}
#if defined(JEMALLOC_HAVE_MADVISE_HUGE)
	return (madvise(addr, size, MADV_HUGEPAGE) != 0);
#elif defined(JEMALLOC_HAVE_MEMCNTL)
	struct memcntl_mha m = {0};
	m.mha_cmd = MHA_MAPSIZE_VA;
	m.mha_pagesize = HUGEPAGE;
	/*
	 * memcntl() returns 0 on success, so a non-zero result is the error
	 * case, exactly as for madvise() above.
	 */
	return (memcntl(addr, size, MC_HAT_ADVISE, (caddr_t)&m, 0, 0) != 0);
#else
	return true;
#endif
}

/* Huge page advice for a range that must be HUGEPAGE-granular. */
bool
pages_huge(void *addr, size_t size) {
	const bool require_aligned = true;
	return pages_huge_impl(addr, size, require_aligned);
}

/* Huge page advice for a range without the HUGEPAGE alignment checks. */
static bool
pages_huge_unaligned(void *addr, size_t size) {
	const bool require_aligned = false;
	return pages_huge_impl(addr, size, require_aligned);
}

/*
 * Ask the OS not to back [addr, addr + size) with huge pages.
 *
 * aligned: when true, addr and size are asserted to be HUGEPAGE-granular.
 *
 * Returns true only when the madvise() call fails; without MADV_HUGEPAGE
 * support there is nothing to do and false is returned.
 */
static bool
pages_nohuge_impl(void *addr, size_t size, bool aligned) {
	if (aligned) {
		assert(HUGEPAGE_ADDR2BASE(addr) == addr);
		assert(HUGEPAGE_CEILING(size) == size);
	}

#ifdef JEMALLOC_HAVE_MADVISE_HUGE
	int rc = madvise(addr, size, MADV_NOHUGEPAGE);
	return rc != 0;
#else
	return false;
#endif
}

/* No-huge-page advice for a range that must be HUGEPAGE-granular. */
bool
pages_nohuge(void *addr, size_t size) {
	const bool require_aligned = true;
	return pages_nohuge_impl(addr, size, require_aligned);
}

/* No-huge-page advice for a range without the HUGEPAGE alignment checks. */
static bool
pages_nohuge_unaligned(void *addr, size_t size) {
	const bool require_aligned = false;
	return pages_nohuge_impl(addr, size, require_aligned);
}

/*
 * Ask the kernel to collapse [addr, addr + size) into huge pages via
 * MADV_COLLAPSE.  Returns false on success, and true on failure or when
 * MADV_COLLAPSE is not available in this build.
 */
bool
pages_collapse(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
	/*
	 * MADV_COLLAPSE has one more precondition that is not easy to express
	 * as an assertion: for madvise(addr, size, MADV_COLLAPSE) to succeed,
	 * at least one page in the range must currently be backed by physical
	 * memory.  In particular, pages_collapse cannot be called on a freshly
	 * mapped region.  See the madvise(2) man page for details.
	 */
#if defined(JEMALLOC_HAVE_MADVISE_COLLAPSE)                                    \
    && (defined(MADV_COLLAPSE) || defined(JEMALLOC_MADV_COLLAPSE))
#	if defined(MADV_COLLAPSE)
	const int collapse_advice = MADV_COLLAPSE;
#	else
	const int collapse_advice = JEMALLOC_MADV_COLLAPSE;
#	endif
	return madvise(addr, size, collapse_advice) != 0;
#else
	return true;
#endif
}

/*
 * Exclude [addr, addr + size) from core dumps where the platform offers
 * MADV_DONTDUMP or MADV_NOCORE.  Returns true only when the madvise() call
 * fails; with neither mechanism available it is a no-op returning false.
 */
bool
pages_dontdump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
#if defined(JEMALLOC_MADVISE_DONTDUMP)
	int rc = madvise(addr, size, MADV_DONTDUMP);
	return rc != 0;
#elif defined(JEMALLOC_MADVISE_NOCORE)
	int rc = madvise(addr, size, MADV_NOCORE);
	return rc != 0;
#else
	return false;
#endif
}

/*
 * Counterpart of pages_dontdump(): include [addr, addr + size) in core dumps
 * again via MADV_DODUMP or MADV_CORE.  Returns true only when the madvise()
 * call fails; with neither mechanism available it is a no-op returning false.
 */
bool
pages_dodump(void *addr, size_t size) {
	assert(PAGE_ADDR2BASE(addr) == addr);
	assert(PAGE_CEILING(size) == size);
#if defined(JEMALLOC_MADVISE_DONTDUMP)
	int rc = madvise(addr, size, MADV_DODUMP);
	return rc != 0;
#elif defined(JEMALLOC_MADVISE_NOCORE)
	int rc = madvise(addr, size, MADV_CORE);
	return rc != 0;
#else
	return false;
#endif
}

#ifdef JEMALLOC_HAVE_PROCESS_MADVISE
#	include <sys/mman.h>
#	include <sys/syscall.h>

#	ifndef PIDFD_SELF
#		define PIDFD_SELF -10000
#	endif

static atomic_b_t process_madvise_gate = ATOMIC_INIT(true);

/*
 * Clamp opt_process_madvise_max_batch to PROCESS_MADVISE_MAX_BATCH_LIMIT.
 * A batch size of zero is left untouched.  Always returns false (no error).
 */
static bool
init_process_madvise(void) {
	if (opt_process_madvise_max_batch != 0
	    && opt_process_madvise_max_batch
	        > PROCESS_MADVISE_MAX_BATCH_LIMIT) {
		opt_process_madvise_max_batch = PROCESS_MADVISE_MAX_BATCH_LIMIT;
	}

	return false;
}

#	ifdef SYS_process_madvise
#		define JE_SYS_PROCESS_MADVISE_NR SYS_process_madvise
#	else
#		define JE_SYS_PROCESS_MADVISE_NR                              \
			EXPERIMENTAL_SYS_PROCESS_MADVISE_NR
#	endif

/*
 * Purge a batch of ranges with a single process_madvise(MADV_DONTNEED) call
 * on the current process.
 *
 * vec:         array of struct iovec describing the ranges.
 * vec_len:     number of entries in vec.
 * total_bytes: byte count the caller expects the syscall to report.
 *
 * Returns false only when the syscall reports exactly total_bytes; true
 * otherwise, including when the syscall was previously found unusable.
 */
static bool
pages_purge_process_madvise_impl(
    void *vec, size_t vec_len, size_t total_bytes) {
	if (!atomic_load_b(&process_madvise_gate, ATOMIC_RELAXED)) {
		return true;
	}

	/*
	 * TODO: remove this save/restore of errno after supporting errno
	 * preservation for free() call properly.
	 */
	int    errno_on_entry = get_errno();
	size_t advised = (size_t)syscall(JE_SYS_PROCESS_MADVISE_NR,
	    PIDFD_SELF, (struct iovec *)vec, vec_len, MADV_DONTNEED, 0);
	if (advised != (size_t)-1) {
		return advised != total_bytes;
	}

	switch (errno) {
	case EPERM:
	case EINVAL:
	case ENOSYS:
	case EBADF:
		/* Process madvise not supported the way we need it. */
		atomic_store_b(&process_madvise_gate, false, ATOMIC_RELAXED);
		break;
	default:
		break;
	}
	set_errno(errno_on_entry);

	return advised != total_bytes;
}

#else

/*
 * Stand-in used when process_madvise support is not compiled in: there is
 * nothing to initialize, so it reports no error.
 */
static bool
init_process_madvise(void) {
	const bool init_failed = false;
	return init_failed;
}

/*
 * Stand-in used when process_madvise support is not compiled in.  It is not
 * expected to be called (not_reached()); the trailing return reports failure.
 */
static bool
pages_purge_process_madvise_impl(
    void *vec, size_t vec_len, size_t total_bytes) {
	const bool purge_failed = true;

	not_reached();
	return purge_failed;
}

#endif

/*
 * Public wrapper around the build-specific process_madvise purge; arguments
 * and result are passed through unchanged.
 */
bool
pages_purge_process_madvise(void *vec, size_t vec_len, size_t total_bytes) {
	bool purge_failed = pages_purge_process_madvise_impl(
	    vec, vec_len, total_bytes);
	return purge_failed;
}

/*
 * Query the operating system for its page size, in bytes.
 *
 * If sysconf() cannot report the page size, the compile-time page size
 * (1 << LG_PAGE bytes) is used as a fallback.
 */
static size_t
os_page_detect(void) {
#ifdef _WIN32
	SYSTEM_INFO si;
	GetSystemInfo(&si);
	return si.dwPageSize;
#elif defined(__FreeBSD__)
	/*
	 * This returns the value obtained from
	 * the auxv vector, avoiding a syscall.
	 */
	return getpagesize();
#else
	long result = sysconf(_SC_PAGESIZE);
	if (result == -1) {
		/*
		 * LG_PAGE is the base-2 logarithm of the page size, not a
		 * size; callers need a byte count, so convert it.
		 */
		return (size_t)1 << LG_PAGE;
	}
	return (size_t)result;
#endif
}

#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
/*
 * Read the vm.overcommit sysctl and report whether the OS overcommits,
 * i.e. whether the two low bits of the value are both clear.  A failed
 * sysctl lookup is reported as "does not overcommit".
 */
static bool
os_overcommits_sysctl(void) {
	int    overcommit_value;
	size_t value_len = sizeof(overcommit_value);
	int    rc;

#	if defined(__FreeBSD__) && defined(VM_OVERCOMMIT)
	int name[2];

	name[0] = CTL_VM;
	name[1] = VM_OVERCOMMIT;
	rc = sysctl(name, 2, &overcommit_value, &value_len, NULL, 0);
#	else
	rc = sysctlbyname(
	    "vm.overcommit", &overcommit_value, &value_len, NULL, 0);
#	endif
	if (rc != 0) {
		return false; /* Error. */
	}

	return (overcommit_value & 0x3) == 0;
}
#endif

#ifdef JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY
/*
 * Read the first byte of /proc/sys/vm/overcommit_memory and report whether
 * the OS overcommits.  Failure to open or read the file is reported as
 * "does not overcommit".
 */
static bool
os_overcommits_proc(void) {
	const char *path = "/proc/sys/vm/overcommit_memory";
	char        mode;

#	if defined(O_CLOEXEC)
	int fd = malloc_open(path, O_RDONLY | O_CLOEXEC);
#	else
	int fd = malloc_open(path, O_RDONLY);
	if (fd != -1) {
		/* No O_CLOEXEC at open time: set close-on-exec afterwards. */
		int fd_flags = fcntl(fd, F_GETFD);
		fcntl(fd, F_SETFD, fd_flags | FD_CLOEXEC);
	}
#	endif

	if (fd == -1) {
		return false; /* Error. */
	}

	ssize_t nread = malloc_read_fd(fd, &mode, sizeof(mode));
	malloc_close(fd);

	if (nread < 1) {
		return false; /* Error. */
	}

	/*
	 * /proc/sys/vm/overcommit_memory meanings:
	 * 0: Heuristic overcommit.
	 * 1: Always overcommit.
	 * 2: Never overcommit.
	 */
	switch (mode) {
	case '0':
	case '1':
		return true;
	default:
		return false;
	}
}
#endif

/*
 * Decide whether pages_set_thp_state() has nothing to do: either no THP
 * change was requested (thp_mode_do_nothing), or the requested mode
 * (always / never) is already the system-wide mode.
 */
static bool
pages_should_skip_set_thp_state(void) {
	if (opt_thp == thp_mode_do_nothing) {
		return true;
	}
	if (opt_thp == thp_mode_always
	    && init_system_thp_mode == system_thp_mode_always) {
		return true;
	}
	if (opt_thp == thp_mode_never
	    && init_system_thp_mode == system_thp_mode_never) {
		return true;
	}
	return false;
}
/*
 * Apply the requested THP mode (opt_thp) to [ptr, ptr + size) when it differs
 * from what the system already does.  The range is not required to be
 * HUGEPAGE-aligned, and the results of the advice calls are ignored.
 */
void
pages_set_thp_state(void *ptr, size_t size) {
	if (pages_should_skip_set_thp_state()) {
		return;
	}
	assert(opt_thp != thp_mode_not_supported
	    && init_system_thp_mode != system_thp_mode_not_supported);

	if (opt_thp == thp_mode_never) {
		assert(init_system_thp_mode == system_thp_mode_madvise
		    || init_system_thp_mode == system_thp_mode_always);
		pages_nohuge_unaligned(ptr, size);
		return;
	}

	bool want_huge = (opt_thp == thp_mode_always);
	bool system_is_madvise =
	    (init_system_thp_mode == system_thp_mode_madvise);
	if (want_huge && system_is_madvise) {
		pages_huge_unaligned(ptr, size);
	}
}

/*
 * Detect the system-wide transparent huge page mode and record it in
 * init_system_thp_mode.  When opt_hpa_opts.hugify_style is "auto", a concrete
 * style is also chosen from the detected mode.
 *
 * On any failure (no huge page mechanism compiled in, the sysfs file cannot
 * be opened or read, or its contents are not recognized) both opt_thp and
 * init_system_thp_mode are set to their "not supported" values.
 */
static void
init_thp_state(void) {
	if (!have_madvise_huge && !have_memcntl) {
		if (metadata_thp_enabled() && opt_abort) {
			malloc_write("<jemalloc>: no MADV_HUGEPAGE support\n");
			abort();
		}
		goto label_error;
	}
#if defined(JEMALLOC_HAVE_MADVISE_HUGE)
	/* The three contents the sysfs file is expected to have. */
	static const char sys_state_madvise[] = "always [madvise] never\n";
	static const char sys_state_always[] = "[always] madvise never\n";
	static const char sys_state_never[] = "always madvise [never]\n";
	char              buf[sizeof(sys_state_madvise)];

	int fd = malloc_open(
	    "/sys/kernel/mm/transparent_hugepage/enabled", O_RDONLY);
	if (fd == -1) {
		goto label_error;
	}

	ssize_t nread = malloc_read_fd(fd, &buf, sizeof(buf));
	malloc_close(fd);
	/*
	 * Reject empty reads as well as errors: with nread == 0 every
	 * strncmp() below would compare zero bytes and report a match, so
	 * the first candidate (madvise) would be selected without evidence.
	 */
	if (nread < 1) {
		goto label_error;
	}

	/* Only the bytes actually read are compared. */
	if (strncmp(buf, sys_state_madvise, (size_t)nread) == 0) {
		init_system_thp_mode = system_thp_mode_madvise;
	} else if (strncmp(buf, sys_state_always, (size_t)nread) == 0) {
		init_system_thp_mode = system_thp_mode_always;
	} else if (strncmp(buf, sys_state_never, (size_t)nread) == 0) {
		init_system_thp_mode = system_thp_mode_never;
	} else {
		goto label_error;
	}
	if (opt_hpa_opts.hugify_style == hpa_hugify_style_auto) {
		if (init_system_thp_mode == system_thp_mode_madvise) {
			opt_hpa_opts.hugify_style = hpa_hugify_style_lazy;
		} else {
			opt_hpa_opts.hugify_style = hpa_hugify_style_none;
		}
	}
	return;
#elif defined(JEMALLOC_HAVE_MEMCNTL)
	init_system_thp_mode = system_thp_mode_madvise;
	if (opt_hpa_opts.hugify_style == hpa_hugify_style_auto) {
		opt_hpa_opts.hugify_style = hpa_hugify_style_eager;
	}
	return;
#endif
label_error:
	opt_thp = thp_mode_not_supported;
	init_system_thp_mode = system_thp_mode_not_supported;
}

/*
 * One-time initialization of the pages layer: detects the OS page size,
 * probes MADV_DONTNEED zeroing (where configured), sets up the mmap flags and
 * overcommit state, detects the THP mode, and checks lazy purge support at
 * run time.  Returns true on failure and false on success.
 */
bool
pages_boot(void) {
	os_page = os_page_detect();
	/* The rest of this file relies on os_page <= PAGE. */
	if (os_page > PAGE) {
		malloc_write("<jemalloc>: Unsupported system page size\n");
		if (opt_abort) {
			abort();
		}
		return true;
	}

#ifdef JEMALLOC_PURGE_MADVISE_DONTNEED_ZEROS
	if (!opt_trust_madvise) {
		/* Probe whether MADV_DONTNEED really zeroes pages. */
		madvise_dont_need_zeros_is_faulty =
		    !madvise_MADV_DONTNEED_zeroes_pages();
		if (madvise_dont_need_zeros_is_faulty) {
			malloc_write(
			    "<jemalloc>: MADV_DONTNEED does not work (memset will be used instead)\n");
			malloc_write(
			    "<jemalloc>: (This is the expected behaviour if you are running under QEMU)\n");
		}
	} else {
		/*
		 * opt_trust_madvise is set: skip the runtime check and
		 * treat MADV_DONTNEED as working.
		 */
		madvise_dont_need_zeros_is_faulty = 0;
	}
#endif

#ifndef _WIN32
	mmap_flags = MAP_PRIVATE | MAP_ANON;
#endif

	/* Determine whether the OS overcommits memory. */
#ifdef JEMALLOC_SYSCTL_VM_OVERCOMMIT
	os_overcommits = os_overcommits_sysctl();
#elif defined(JEMALLOC_PROC_SYS_VM_OVERCOMMIT_MEMORY)
	os_overcommits = os_overcommits_proc();
#	ifdef MAP_NORESERVE
	if (os_overcommits) {
		mmap_flags |= MAP_NORESERVE;
	}
#	endif
#elif defined(__NetBSD__)
	os_overcommits = true;
#else
	os_overcommits = false;
#endif

	init_thp_state();

#ifdef __FreeBSD__
	/*
	 * FreeBSD doesn't need the check; madvise(2) is known to work.
	 */
#else
	/* Detect lazy purge runtime support. */
	if (pages_can_purge_lazy) {
		bool  committed = false;
		/* Scratch page used only to try a lazy purge on. */
		void *madv_free_page = os_pages_map(
		    NULL, PAGE, PAGE, &committed);
		if (madv_free_page == NULL) {
			return true;
		}
		assert(pages_can_purge_lazy_runtime);
		if (pages_purge_lazy(madv_free_page, PAGE)) {
			pages_can_purge_lazy_runtime = false;
		}
		os_pages_unmap(madv_free_page, PAGE);
	}
#endif
	if (init_process_madvise()) {
		if (opt_abort) {
			abort();
		}
		return true;
	}

	return false;
}