edata_cache: Allow unbounded fast caching.

The edata_cache_small had a fill/flush heuristic.  In retrospect, this was a
premature optimization; more testing indicates that an unbounded cache is
effectively fine here, and moreover we spend a nontrivial amount of time doing
unnecessary filling/flushing.

As the HPA takes on a larger and larger fraction of all allocations, any
theoretical differences in allocation patterns should shrink.  The HPA is more
efficient with its metadata in general, so it still comes out ahead on metadata
usage anyway.
Author:    David Goldblatt
Committer: David Goldblatt
Date:      2021-07-23 15:29:43 -07:00
Parent:    d93eef2f40
Commit:    92a1e38f52

5 changed files with 99 additions and 151 deletions
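For intuition, here is a minimal, self-contained sketch of the behavior after this change (not jemalloc's actual code: fast_cache_t, fallback_t, node_t, and FILL_BATCH are hypothetical stand-ins, and the fallback's mutex is elided). get() still batch-fills from the shared fallback when the local list runs dry, but put() now simply prepends to the local list, with no count bookkeeping and no flush threshold:

#include <stddef.h>

/* Hypothetical simplified types standing in for edata_t lists. */
typedef struct node_s node_t;
struct node_s {
	node_t *next;
};

typedef struct {
	node_t *head;		/* shared free list; its mutex is elided here */
} fallback_t;

typedef struct {
	node_t *head;		/* local list; unbounded after this change */
	fallback_t *fallback;
} fast_cache_t;

#define FILL_BATCH 8		/* stand-in for EDATA_CACHE_FAST_FILL */

static node_t *
fast_cache_get(fast_cache_t *fc) {
	if (fc->head == NULL) {
		/* Slow path: pull a batch from the shared fallback. */
		for (int i = 0; i < FILL_BATCH && fc->fallback->head != NULL;
		    i++) {
			node_t *n = fc->fallback->head;
			fc->fallback->head = n->next;
			n->next = fc->head;
			fc->head = n;
		}
	}
	node_t *n = fc->head;
	if (n != NULL) {
		fc->head = n->next;
	}
	return n;	/* NULL: caller falls back to a fresh allocation */
}

static void
fast_cache_put(fast_cache_t *fc, node_t *n) {
	/*
	 * Unconditional local caching: the old code also counted entries and
	 * flushed everything back to the fallback once a threshold was hit.
	 */
	n->next = fc->head;
	fc->head = n;
}

The count bookkeeping and threshold flush that this replaces appear as the removed lines in edata_cache_small_put() in the diff below.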

src/edata_cache.c

@@ -56,39 +56,34 @@ edata_cache_postfork_child(tsdn_t *tsdn, edata_cache_t *edata_cache) {
 }
 
 void
-edata_cache_small_init(edata_cache_small_t *ecs, edata_cache_t *fallback) {
+edata_cache_fast_init(edata_cache_fast_t *ecs, edata_cache_t *fallback) {
 	edata_list_inactive_init(&ecs->list);
-	ecs->count = 0;
 	ecs->fallback = fallback;
 	ecs->disabled = false;
 }
 
 static void
-edata_cache_small_try_fill_from_fallback(tsdn_t *tsdn,
-    edata_cache_small_t *ecs) {
-	assert(ecs->count == 0);
+edata_cache_fast_try_fill_from_fallback(tsdn_t *tsdn,
+    edata_cache_fast_t *ecs) {
 	edata_t *edata;
 	malloc_mutex_lock(tsdn, &ecs->fallback->mtx);
-	while (ecs->count < EDATA_CACHE_SMALL_FILL) {
-		edata = edata_avail_first(&ecs->fallback->avail);
+	for (int i = 0; i < EDATA_CACHE_FAST_FILL; i++) {
+		edata = edata_avail_remove_first(&ecs->fallback->avail);
 		if (edata == NULL) {
 			break;
 		}
-		edata_avail_remove(&ecs->fallback->avail, edata);
 		edata_list_inactive_append(&ecs->list, edata);
-		ecs->count++;
 		atomic_load_sub_store_zu(&ecs->fallback->count, 1);
 	}
 	malloc_mutex_unlock(tsdn, &ecs->fallback->mtx);
 }
 
 edata_t *
-edata_cache_small_get(tsdn_t *tsdn, edata_cache_small_t *ecs) {
+edata_cache_fast_get(tsdn_t *tsdn, edata_cache_fast_t *ecs) {
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_EDATA_CACHE, 0);
 
 	if (ecs->disabled) {
-		assert(ecs->count == 0);
 		assert(edata_list_inactive_first(&ecs->list) == NULL);
 		return edata_cache_get(tsdn, ecs->fallback);
 	}
@@ -96,15 +91,13 @@ edata_cache_small_get(tsdn_t *tsdn, edata_cache_small_t *ecs) {
 	edata_t *edata = edata_list_inactive_first(&ecs->list);
 	if (edata != NULL) {
 		edata_list_inactive_remove(&ecs->list, edata);
-		ecs->count--;
 		return edata;
 	}
 	/* Slow path; requires synchronization. */
-	edata_cache_small_try_fill_from_fallback(tsdn, ecs);
+	edata_cache_fast_try_fill_from_fallback(tsdn, ecs);
 	edata = edata_list_inactive_first(&ecs->list);
 	if (edata != NULL) {
 		edata_list_inactive_remove(&ecs->list, edata);
-		ecs->count--;
 	} else {
 		/*
 		 * Slowest path (fallback was also empty); allocate something
@@ -116,7 +109,7 @@ edata_cache_small_get(tsdn_t *tsdn, edata_cache_small_t *ecs) {
 }
 
 static void
-edata_cache_small_flush_all(tsdn_t *tsdn, edata_cache_small_t *ecs) {
+edata_cache_fast_flush_all(tsdn_t *tsdn, edata_cache_fast_t *ecs) {
 	/*
 	 * You could imagine smarter cache management policies (like
 	 * only flushing down to some threshold in anticipation of
@@ -132,19 +125,16 @@ edata_cache_small_flush_all(tsdn_t *tsdn, edata_cache_small_t *ecs) {
 		edata_avail_insert(&ecs->fallback->avail, edata);
 		nflushed++;
 	}
-	atomic_load_add_store_zu(&ecs->fallback->count, ecs->count);
+	atomic_load_add_store_zu(&ecs->fallback->count, nflushed);
 	malloc_mutex_unlock(tsdn, &ecs->fallback->mtx);
-	assert(nflushed == ecs->count);
-	ecs->count = 0;
 }
 
 void
-edata_cache_small_put(tsdn_t *tsdn, edata_cache_small_t *ecs, edata_t *edata) {
+edata_cache_fast_put(tsdn_t *tsdn, edata_cache_fast_t *ecs, edata_t *edata) {
 	witness_assert_depth_to_rank(tsdn_witness_tsdp_get(tsdn),
 	    WITNESS_RANK_EDATA_CACHE, 0);
 
 	if (ecs->disabled) {
-		assert(ecs->count == 0);
 		assert(edata_list_inactive_first(&ecs->list) == NULL);
 		edata_cache_put(tsdn, ecs->fallback, edata);
 		return;
@@ -155,15 +145,10 @@ edata_cache_small_put(tsdn_t *tsdn, edata_cache_small_t *ecs, edata_t *edata) {
 	 * cache locality.
 	 */
 	edata_list_inactive_prepend(&ecs->list, edata);
-	ecs->count++;
-	if (ecs->count > EDATA_CACHE_SMALL_MAX) {
-		assert(ecs->count == EDATA_CACHE_SMALL_MAX + 1);
-		edata_cache_small_flush_all(tsdn, ecs);
-	}
 }
 
 void
-edata_cache_small_disable(tsdn_t *tsdn, edata_cache_small_t *ecs) {
-	edata_cache_small_flush_all(tsdn, ecs);
+edata_cache_fast_disable(tsdn_t *tsdn, edata_cache_fast_t *ecs) {
+	edata_cache_fast_flush_all(tsdn, ecs);
 	ecs->disabled = true;
 }

src/hpa.c

@@ -187,7 +187,7 @@ hpa_shard_init(hpa_shard_t *shard, hpa_central_t *central, emap_t *emap,
 	assert(edata_cache != NULL);
 	shard->central = central;
 	shard->base = base;
-	edata_cache_small_init(&shard->ecs, edata_cache);
+	edata_cache_fast_init(&shard->ecf, edata_cache);
 	psset_init(&shard->psset);
 	shard->age_counter = 0;
 	shard->ind = ind;
@@ -537,7 +537,7 @@ static edata_t *
 hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
     bool *oom) {
 	bool err;
-	edata_t *edata = edata_cache_small_get(tsdn, &shard->ecs);
+	edata_t *edata = edata_cache_fast_get(tsdn, &shard->ecf);
 	if (edata == NULL) {
 		*oom = true;
 		return NULL;
@@ -545,7 +545,7 @@ hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
 
 	hpdata_t *ps = psset_pick_alloc(&shard->psset, size);
 	if (ps == NULL) {
-		edata_cache_small_put(tsdn, &shard->ecs, edata);
+		edata_cache_fast_put(tsdn, &shard->ecf, edata);
 		return NULL;
 	}
@@ -592,7 +592,7 @@ hpa_try_alloc_one_no_grow(tsdn_t *tsdn, hpa_shard_t *shard, size_t size,
 		 * tweaked the stats, but our tweaks weren't really accurate).
 		 */
 		psset_update_end(&shard->psset, ps);
-		edata_cache_small_put(tsdn, &shard->ecs, edata);
+		edata_cache_fast_put(tsdn, &shard->ecf, edata);
 		*oom = true;
 		return NULL;
 	}
@@ -805,7 +805,7 @@ hpa_dalloc_locked(tsdn_t *tsdn, hpa_shard_t *shard, edata_t *edata) {
 	assert(ps != NULL);
 	void *unreserve_addr = edata_addr_get(edata);
 	size_t unreserve_size = edata_size_get(edata);
-	edata_cache_small_put(tsdn, &shard->ecs, edata);
+	edata_cache_fast_put(tsdn, &shard->ecf, edata);
 
 	psset_update_begin(&shard->psset, ps);
 	hpdata_unreserve(ps, unreserve_addr, unreserve_size);
@@ -844,7 +844,7 @@ hpa_dalloc(tsdn_t *tsdn, pai_t *self, edata_t *edata) {
 void
 hpa_shard_disable(tsdn_t *tsdn, hpa_shard_t *shard) {
 	malloc_mutex_lock(tsdn, &shard->mtx);
-	edata_cache_small_disable(tsdn, &shard->ecs);
+	edata_cache_fast_disable(tsdn, &shard->ecf);
 	malloc_mutex_unlock(tsdn, &shard->mtx);
 }