diff --git a/doc/configuration.txt b/doc/configuration.txt index 0cc2bdee3..b66f75dd4 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -1126,6 +1126,7 @@ The following keywords are supported in the "global" section : - tune.maxaccept - tune.maxpollevents - tune.maxrewrite + - tune.memory.hot-size - tune.pattern.cache-size - tune.peers.max-updates-at-once - tune.pipesize @@ -2983,6 +2984,22 @@ tune.maxrewrite larger than that. This means you don't have to worry about it when changing bufsize. +tune.memory.hot-size + Sets the per-thread amount of memory that will be kept hot in the local cache + and will never be recoverable by other threads. Access to this memory is very + fast (lockless), and having enough is critical to maintain a good performance + level under extreme thread contention. The value is expressed in bytes, and + the default value is configured at build time via CONFIG_HAP_POOL_CACHE_SIZE + which defaults to 524288 (512 kB). A larger value may increase performance in + some usage scenarios, especially when performance profiles show that memory + allocation is stressed a lot. Experience shows that a good value sits between + one and two times the per-core L2 cache size. Too large values will have a + negative impact on performance by making inefficient use of the L3 caches in + the CPUs, and will consume larger amounts of memory. It is recommended not to + change this value, or to proceed in small increments. In order to completely + disable the per-thread caches, using a very small value could work, but + it is better to use "-dMno-cache" on the command-line. + tune.pattern.cache-size Sets the size of the pattern lookup cache to entries. This is an LRU cache which reminds previous lookups and their results. 
It is used by ACLs diff --git a/doc/internals/api/pools.txt b/doc/internals/api/pools.txt index 4023dc316..480cf24e5 100644 --- a/doc/internals/api/pools.txt +++ b/doc/internals/api/pools.txt @@ -124,13 +124,17 @@ properly handle allocation failures. It may also be enabled at boot time using "-dMfail". In this case the desired average rate of allocation failures can be fixed by global setting "tune.fail-alloc" expressed in percent. -The thread-local caches contain the freshest objects whose total size amounts -to CONFIG_HAP_POOL_CACHE_SIZE bytes, which is typically was 1MB before 2.6 and -is 512kB after. The aim is to keep hot objects that still fit in the CPU core's -private L2 cache. Once these objects do not fit into the cache anymore, there's -no benefit keeping them local to the thread, so they'd rather be returned to -the shared pool or the main allocator so that any other thread may make use of -them. +The thread-local caches contain the freshest objects. Their total size amounts +to the number of bytes set in global.tune.pool_cache_size, which may be adjusted +by the "tune.memory.hot-size" global option, which itself defaults to build +time setting CONFIG_HAP_POOL_CACHE_SIZE, which was 1MB before 2.6 and 512kB +after. The aim is to keep hot objects that still fit in the CPU core's private +L2 cache. Once these objects do not fit into the cache anymore, there's no +benefit keeping them local to the thread, so they'd rather be returned to the +shared pool or the main allocator so that any other thread may make use of +them. Under extreme thread contention the cost of accessing shared structures +in the global cache or in malloc() may still be important and it may prove +useful to increase the thread-local cache size. 3. Storage in thread-local caches @@ -563,14 +567,15 @@ CONFIG_HAP_NO_GLOBAL_POOLS boot-time option "-dMno-global". CONFIG_HAP_POOL_CACHE_SIZE - This allows one to define the size of the per-thread cache, in bytes. 
- The default value is 512 kB (524288). Smaller values will use less - memory at the expense of a possibly higher CPU usage when using many - threads. Higher values will give diminishing returns on performance - while using much more memory. Usually there is no benefit in using - more than a per-core L2 cache size. It would be better not to set this - value lower than a few times the size of a buffer (bufsize, defaults to - 16 kB). + This allows one to define the default size of the per-thread cache, in + bytes. The default value is 512 kB (524288). Smaller values will use + less memory at the expense of a possibly higher CPU usage when using + many threads. Higher values will give diminishing returns on + performance while using much more memory. Usually there is no benefit + in using more than a per-core L2 cache size. It would be better not to + set this value lower than a few times the size of a buffer (bufsize, + defaults to 16 kB). In addition, keep in mind that this default may be + overridden at boot time using the "tune.memory.hot-size" global setting. 
CONFIG_HAP_POOL_CLUSTER_SIZE This allows one to define the maximum number of objects that will be diff --git a/include/haproxy/global-t.h b/include/haproxy/global-t.h index 2e9b61b24..11f4b2c0a 100644 --- a/include/haproxy/global-t.h +++ b/include/haproxy/global-t.h @@ -160,6 +160,7 @@ struct global { int pool_high_ratio; /* max ratio of FDs used before we start killing idle connections when creating new connections */ int pool_low_count; /* max number of opened fd before we stop using new idle connections */ int pool_high_count; /* max number of opened fd before we start killing idle connections when creating new connections */ + size_t pool_cache_size; /* total per-thread cache size in bytes, all pools included (defaults to CONFIG_HAP_POOL_CACHE_SIZE) */ unsigned short idle_timer; /* how long before an empty buffer is considered idle (ms) */ #ifdef USE_QUIC unsigned int quic_backend_max_idle_timeout; diff --git a/src/haproxy.c b/src/haproxy.c index 178f27484..68c78427d 100644 --- a/src/haproxy.c +++ b/src/haproxy.c @@ -2670,6 +2670,14 @@ static void init(int argc, char **argv) if (!hlua_post_init()) exit(1); + + /* Set the per-thread pool cache size to the default value if not set. + * This is the right place to decide to automatically adjust it (e.g. + * check L2 cache size, thread counts or take into account certain + * expensive pools). 
+	 */
+	if (!global.tune.pool_cache_size)
+		global.tune.pool_cache_size = CONFIG_HAP_POOL_CACHE_SIZE;
 }
 
 void deinit(void)
diff --git a/src/pool.c b/src/pool.c
index e225d2144..df9d06090 100644
--- a/src/pool.c
+++ b/src/pool.c
@@ -517,7 +517,7 @@ void pool_evict_from_local_cache(struct pool_head *pool, int full)
 	while ((ph->count && full) ||
 	       (ph->count >= CONFIG_HAP_POOL_CLUSTER_SIZE &&
 	        ph->count >= 16 + pool_cache_count / 8 &&
-	        pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 3 / 4)) {
+	        pool_cache_bytes > global.tune.pool_cache_size * 3 / 4)) {
 		pool_evict_last_items(pool, ph, CONFIG_HAP_POOL_CLUSTER_SIZE);
 	}
 }
@@ -546,7 +546,7 @@ void pool_evict_from_local_caches()
 		BUG_ON(pool != ph->pool);
 
 		pool_evict_last_items(pool, ph, CONFIG_HAP_POOL_CLUSTER_SIZE);
-	} while (pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 7 / 8);
+	} while (pool_cache_bytes > global.tune.pool_cache_size * 7 / 8);
 }
 
 /* Frees an object to the local cache, possibly pushing oldest objects to the
@@ -572,10 +572,10 @@ void pool_put_to_cache(struct pool_head *pool, void *ptr, const void *caller)
 	pool_cache_count++;
 	pool_cache_bytes += pool->size;
 
-	if (unlikely(pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 3 / 4)) {
+	if (unlikely(pool_cache_bytes > global.tune.pool_cache_size * 3 / 4)) {
 		if (ph->count >= 16 + pool_cache_count / 8 + CONFIG_HAP_POOL_CLUSTER_SIZE)
 			pool_evict_from_local_cache(pool, 0);
-		if (pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE)
+		if (pool_cache_bytes > global.tune.pool_cache_size)
 			pool_evict_from_local_caches();
 	}
 }
@@ -790,7 +790,8 @@ void __pool_free(struct pool_head *pool, void *ptr)
 	}
 #endif
 
-	if (unlikely(pool_debugging & POOL_DBG_NO_CACHE)) {
+	if (unlikely((pool_debugging & POOL_DBG_NO_CACHE) ||
+		     global.tune.pool_cache_size < pool->size)) {
 		pool_free_nocache(pool, ptr);
 		return;
 	}
@@ -1211,6 +1212,47 @@ static int mem_parse_global_fail_alloc(char **args, int section_type, struct pro
 	return 0;
 }
 
+/* config parser for global "tune.memory.hot-size". It expects exactly one
+ * argument, a strictly positive decimal number of bytes, which is stored into
+ * global.tune.pool_cache_size. Returns 0 on success, or -1 with <err> filled
+ * when the argument is empty, is not a plain integer, or is out of range.
+ */
+static int mem_parse_global_hot_size(char **args, int section_type, struct proxy *curpx,
+                                     const struct proxy *defpx, const char *file, int line,
+                                     char **err)
+{
+	char *end;
+	long size;
+
+	if (too_many_args(1, args, err, NULL))
+		return -1;
+
+	/* strtol() rather than atol(): an empty value or trailing garbage such
+	 * as "512k" must be reported instead of being silently truncated.
+	 */
+	size = strtol(args[1], &end, 10);
+	if (end == args[1] || *end != '\0') {
+		memprintf(err, "'%s' expects an integer number of bytes.", args[0]);
+		return -1;
+	}
+
+	if (size <= 0) {
+		memprintf(err, "'%s' expects a strictly positive value.", args[0]);
+		return -1;
+	}
+
+	/* the eviction thresholds are computed as size * 7 / 8 and
+	 * size * 3 / 4 on a size_t, so the product must not wrap.
+	 */
+	if ((size_t)size > (size_t)-1 / 8) {
+		memprintf(err, "'%s' value is too large.", args[0]);
+		return -1;
+	}
+
+	global.tune.pool_cache_size = size;
+	return 0;
+}
+
 /* config parser for global "no-memory-trimming" */
 static int mem_parse_global_no_mem_trim(char **args, int section_type, struct proxy *curpx,
                                         const struct proxy *defpx, const char *file, int line,
@@ -1225,6 +1267,7 @@ static int mem_parse_global_no_mem_trim(char **args, int section_type, struct pr
 /* register global config keywords */
 static struct cfg_kw_list mem_cfg_kws = {ILH, {
 	{ CFG_GLOBAL, "tune.fail-alloc", mem_parse_global_fail_alloc },
+	{ CFG_GLOBAL, "tune.memory.hot-size", mem_parse_global_hot_size },
 	{ CFG_GLOBAL, "no-memory-trimming", mem_parse_global_no_mem_trim },
 	{ 0, NULL, NULL }
 }};