diff --git a/doc/configuration.txt b/doc/configuration.txt
index 0cc2bdee3..b66f75dd4 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -1126,6 +1126,7 @@ The following keywords are supported in the "global" section :
    - tune.maxaccept
    - tune.maxpollevents
    - tune.maxrewrite
+   - tune.memory.hot-size
    - tune.pattern.cache-size
    - tune.peers.max-updates-at-once
    - tune.pipesize
@@ -2983,6 +2984,22 @@ tune.maxrewrite <number>
   larger than that. This means you don't have to worry about it when changing
   bufsize.
 
+tune.memory.hot-size <number>
+  Sets the per-thread amount of memory that will be kept hot in the local cache
+  and will never be recoverable by other threads. Access to this memory is very
+  fast (lockless), and having enough is critical to maintain a good performance
+  level under extreme thread contention. The value is expressed in bytes, and
+  the default value is configured at build time via CONFIG_HAP_POOL_CACHE_SIZE
+  which defaults to 524288 (512 kB). A larger value may increase performance in
+  some usage scenarios, especially when performance profiles show that memory
+  allocation is stressed a lot. Experience shows that a good value sits between
+  one and two times the per-core L2 cache size. Too large values will have a
+  negative impact on performance by making inefficient use of the L3 caches in
+  the CPUs, and will consume larger amounts of memory. It is recommended not to
+  change this value, or to proceed in small increments. In order to completely
+  disable the per-thread memory caches, a very small value could work, but
+  it is better to use "-dMno-cache" on the command-line.
+
 tune.pattern.cache-size <number>
   Sets the size of the pattern lookup cache to <number> entries. This is an LRU
   cache which reminds previous lookups and their results. It is used by ACLs
diff --git a/doc/internals/api/pools.txt b/doc/internals/api/pools.txt
index 4023dc316..480cf24e5 100644
--- a/doc/internals/api/pools.txt
+++ b/doc/internals/api/pools.txt
@@ -124,13 +124,17 @@ properly handle allocation failures. It may also be enabled at boot time using
 "-dMfail". In this case the desired average rate of allocation failures can be
 fixed by global setting "tune.fail-alloc" expressed in percent.
 
-The thread-local caches contain the freshest objects whose total size amounts
-to CONFIG_HAP_POOL_CACHE_SIZE bytes, which is typically was 1MB before 2.6 and
-is 512kB after. The aim is to keep hot objects that still fit in the CPU core's
-private L2 cache. Once these objects do not fit into the cache anymore, there's
-no benefit keeping them local to the thread, so they'd rather be returned to
-the shared pool or the main allocator so that any other thread may make use of
-them.
+The thread-local caches contain the freshest objects. Their total size amounts
+to the number of bytes set in global.tune.pool_cache_size, which may be adjusted
+by the "tune.memory.hot-size" global option, which itself defaults to build
+time setting CONFIG_HAP_POOL_CACHE_SIZE, which was 1MB before 2.6 and 512kB
+after. The aim is to keep hot objects that still fit in the CPU core's private
+L2 cache. Once these objects do not fit into the cache anymore, there's no
+benefit keeping them local to the thread, so they'd rather be returned to the
+shared pool or the main allocator so that any other thread may make use of
+them. Under extreme thread contention the cost of accessing shared structures
+in the global cache or in malloc() may still be significant, and it may prove
+useful to increase the thread-local cache size.
 
 
 3. Storage in thread-local caches
@@ -563,14 +567,15 @@ CONFIG_HAP_NO_GLOBAL_POOLS
         boot-time option "-dMno-global".
 
 CONFIG_HAP_POOL_CACHE_SIZE
-        This allows one to define the size of the per-thread cache, in bytes.
-        The default value is 512 kB (524288). Smaller values will use less
-        memory at the expense of a possibly higher CPU usage when using many
-        threads. Higher values will give diminishing returns on performance
-        while using much more memory. Usually there is no benefit in using
-        more than a per-core L2 cache size. It would be better not to set this
-        value lower than a few times the size of a buffer (bufsize, defaults to
-        16 kB).
+        This allows one to define the default size of the per-thread cache, in
+        bytes. The default value is 512 kB (524288). Smaller values will use
+        less memory at the expense of a possibly higher CPU usage when using
+        many threads. Higher values will give diminishing returns on
+        performance while using much more memory. Usually there is no benefit
+        in using more than a per-core L2 cache size. It would be better not to
+        set this value lower than a few times the size of a buffer (bufsize,
+        defaults to 16 kB). In addition, keep in mind that this default may be
+        overridden in the configuration using "tune.memory.hot-size".
 
 CONFIG_HAP_POOL_CLUSTER_SIZE
         This allows one to define the maximum number of objects that will be
diff --git a/include/haproxy/global-t.h b/include/haproxy/global-t.h
index 2e9b61b24..11f4b2c0a 100644
--- a/include/haproxy/global-t.h
+++ b/include/haproxy/global-t.h
@@ -160,6 +160,7 @@ struct global {
 		int pool_high_ratio;  /* max ratio of FDs used before we start killing idle connections when creating new connections */
 		int pool_low_count;   /* max number of opened fd before we stop using new idle connections */
 		int pool_high_count;  /* max number of opened fd before we start killing idle connections when creating new connections */
+		size_t pool_cache_size;    /* per-thread cache size per pool (defaults to CONFIG_HAP_POOL_CACHE_SIZE) */
 		unsigned short idle_timer; /* how long before an empty buffer is considered idle (ms) */
 #ifdef USE_QUIC
 		unsigned int quic_backend_max_idle_timeout;
diff --git a/src/haproxy.c b/src/haproxy.c
index 178f27484..68c78427d 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -2670,6 +2670,14 @@ static void init(int argc, char **argv)
 
 	if (!hlua_post_init())
 		exit(1);
+
+	/* Set the per-thread pool cache size to the default value if not set.
+	 * This is the right place to decide to automatically adjust it (e.g.
+	 * check L2 cache size, thread counts or take into account certain
+	 * expensive pools).
+	 */
+	if (!global.tune.pool_cache_size)
+		global.tune.pool_cache_size = CONFIG_HAP_POOL_CACHE_SIZE;
 }
 
 void deinit(void)
diff --git a/src/pool.c b/src/pool.c
index e225d2144..df9d06090 100644
--- a/src/pool.c
+++ b/src/pool.c
@@ -517,7 +517,7 @@ void pool_evict_from_local_cache(struct pool_head *pool, int full)
 	while ((ph->count && full) ||
 	       (ph->count >= CONFIG_HAP_POOL_CLUSTER_SIZE &&
 	        ph->count >= 16 + pool_cache_count / 8 &&
-	        pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 3 / 4)) {
+	        pool_cache_bytes > global.tune.pool_cache_size * 3 / 4)) {
 		pool_evict_last_items(pool, ph, CONFIG_HAP_POOL_CLUSTER_SIZE);
 	}
 }
@@ -546,7 +546,7 @@ void pool_evict_from_local_caches()
 		BUG_ON(pool != ph->pool);
 
 		pool_evict_last_items(pool, ph, CONFIG_HAP_POOL_CLUSTER_SIZE);
-	} while (pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 7 / 8);
+	} while (pool_cache_bytes > global.tune.pool_cache_size * 7 / 8);
 }
 
 /* Frees an object to the local cache, possibly pushing oldest objects to the
@@ -572,10 +572,10 @@ void pool_put_to_cache(struct pool_head *pool, void *ptr, const void *caller)
 	pool_cache_count++;
 	pool_cache_bytes += pool->size;
 
-	if (unlikely(pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 3 / 4)) {
+	if (unlikely(pool_cache_bytes > global.tune.pool_cache_size * 3 / 4)) {
 		if (ph->count >= 16 + pool_cache_count / 8 + CONFIG_HAP_POOL_CLUSTER_SIZE)
 			pool_evict_from_local_cache(pool, 0);
-		if (pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE)
+		if (pool_cache_bytes > global.tune.pool_cache_size)
 			pool_evict_from_local_caches();
 	}
 }
@@ -790,7 +790,8 @@ void __pool_free(struct pool_head *pool, void *ptr)
 	}
 #endif
 
-	if (unlikely(pool_debugging & POOL_DBG_NO_CACHE)) {
+	if (unlikely((pool_debugging & POOL_DBG_NO_CACHE) ||
+		     global.tune.pool_cache_size < pool->size)) {
 		pool_free_nocache(pool, ptr);
 		return;
 	}
@@ -1211,6 +1212,26 @@ static int mem_parse_global_fail_alloc(char **args, int section_type, struct pro
 	return 0;
 }
 
+/* config parser for global "tune.memory.hot-size". Uses strtol() rather than
+ * atol() so that empty, non-numeric or trailing input ("512k") is rejected.
+ * Stores the byte count in global.tune.pool_cache_size; returns 0, or -1 + <err>.
+ */
+static int mem_parse_global_hot_size(char **args, int section_type, struct proxy *curpx,
+                                       const struct proxy *defpx, const char *file, int line,
+                                       char **err)
+{
+	char *end;
+	long size;
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+	size = strtol(args[1], &end, 10);
+	if (end == args[1] || *end != '\0' || size <= 0) {
+		memprintf(err, "'%s' expects a strictly positive integer value in bytes.", args[0]);
+		return -1;
+	}
+	global.tune.pool_cache_size = size; return 0;
+}
+
 /* config parser for global "no-memory-trimming" */
 static int mem_parse_global_no_mem_trim(char **args, int section_type, struct proxy *curpx,
                                        const struct proxy *defpx, const char *file, int line,
@@ -1225,6 +1246,7 @@ static int mem_parse_global_no_mem_trim(char **args, int section_type, struct pr
 /* register global config keywords */
 static struct cfg_kw_list mem_cfg_kws = {ILH, {
 	{ CFG_GLOBAL, "tune.fail-alloc", mem_parse_global_fail_alloc },
+	{ CFG_GLOBAL, "tune.memory.hot-size", mem_parse_global_hot_size },
 	{ CFG_GLOBAL, "no-memory-trimming", mem_parse_global_no_mem_trim },
 	{ 0, NULL, NULL }
 }};