This patch limits the number of zpools used by zswap on smaller systems.

Currently, zswap unconditionally allocates 32 zpools. The fixed count was
chosen to reduce contention on per-zpool locks, but distributing pages
across that many pools adds allocation overhead and wastes memory on
systems with fewer CPUs and less RAM.

With this patch, zswap allocates approximately 2 * CPUs zpools, with a
minimum of 1 zpool on single-CPU systems and up to 32 zpools on systems
with 16 or more CPUs. This is enough to keep the probability that a
thread busy-waits on a zpool lock below 40%. The upper limit of 32
zpools is unchanged.

For memory, the number of zpools is limited to 1 zpool per 60MB of
system memory under the default 20% max pool size limit, assuming the
best case of no fragmentation in zspages and 90% pool usage for
zsmalloc.

Signed-off-by: Takero Funaki <flintglass@xxxxxxxxx>
---
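Notes (not part of the commit message):

In the probability table added below, '<' marks the probability at the
pool count the heuristic targets for that thread count (2 * threads,
rounded up to a power of two, capped at 32); it stays below the 40%
threshold up to 16 threads.

As a rough sanity check of the sizing heuristic, here is a userspace
sketch that mirrors the proposed selection logic. It is illustrative
only and not part of the patch; the helper names (nr_zpools, the sys[]
table) are made up for this note, and it assumes ZSWAP_NR_ZPOOLS_MAX
stays at 32 and the 2950-pages-per-zpool estimate from the in-code
comment.

/* Userspace sketch of the proposed zpool-count selection (illustrative). */
#include <stdio.h>

#define ZSWAP_NR_ZPOOLS_MAX 32

/* floor(log2(n)) for n >= 1, like the kernel's ilog2() on runtime values */
static unsigned int ilog2(unsigned long n)
{
	unsigned int order = 0;

	while (n >>= 1)
		order++;
	return order;
}

/* Expected zpool count for a CPU count and a zswap size cap (in pages) */
static unsigned int nr_zpools(unsigned long cpus, unsigned long max_pages)
{
	unsigned long mem = (max_pages + 2950 - 1) / 2950;	/* DIV_ROUND_UP */
	unsigned long cpu;

	if (mem < 1)
		mem = 1;
	if (mem > ZSWAP_NR_ZPOOLS_MAX)
		mem = ZSWAP_NR_ZPOOLS_MAX;

	if (cpus <= 1)
		cpu = 1;
	else
		cpu = 1UL << ilog2(cpus * 2 < ZSWAP_NR_ZPOOLS_MAX ?
				   cpus * 2 : ZSWAP_NR_ZPOOLS_MAX);

	return 1U << ilog2(mem < cpu ? mem : cpu);
}

int main(void)
{
	/* a few hypothetical system sizes; zswap cap is 20% of RAM (4KiB pages) */
	const struct { unsigned long cpus, ram_mb; } sys[] = {
		{ 1, 512 }, { 2, 2048 }, { 4, 8192 }, { 16, 65536 },
	};

	for (unsigned int i = 0; i < sizeof(sys) / sizeof(sys[0]); i++) {
		unsigned long max_pages = sys[i].ram_mb * 256 / 5;

		printf("%2lu CPUs, %6lu MB RAM -> %2u zpools\n",
		       sys[i].cpus, sys[i].ram_mb,
		       nr_zpools(sys[i].cpus, max_pages));
	}
	return 0;
}

With these assumptions, a 1-CPU/512MB system gets 1 zpool, a 2-CPU/2GB
system gets 4, a 4-CPU/8GB system gets 8, and a 16-CPU/64GB system keeps
the current 32.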
 mm/zswap.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 60 insertions(+), 7 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 4de342a63bc2..e957bfdeaf70 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -124,8 +124,11 @@ static unsigned int zswap_accept_thr_percent = 90; /* of max pool size */
 module_param_named(accept_threshold_percent, zswap_accept_thr_percent,
 		   uint, 0644);
 
-/* Number of zpools in zswap_pool (empirically determined for scalability) */
-#define ZSWAP_NR_ZPOOLS 32
+/*
+ * Maximum number of zpools in zswap_pool (empirically determined for
+ * scalability). This must be a power of 2, for pointer hashing.
+ */
+#define ZSWAP_NR_ZPOOLS_MAX 32
 
 /* Enable/disable memory pressure-based shrinker. */
 static bool zswap_shrinker_enabled = IS_ENABLED(
@@ -157,12 +160,13 @@ struct crypto_acomp_ctx {
  * needs to be verified that it's still valid in the tree.
  */
 struct zswap_pool {
-	struct zpool *zpools[ZSWAP_NR_ZPOOLS];
+	struct zpool *zpools[ZSWAP_NR_ZPOOLS_MAX];
 	struct crypto_acomp_ctx __percpu *acomp_ctx;
 	struct percpu_ref ref;
 	struct list_head list;
 	struct work_struct release_work;
 	struct hlist_node node;
+	unsigned char nr_zpools_order;
 	char tfm_name[CRYPTO_MAX_ALG_NAME];
 };
 
@@ -243,11 +247,55 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
 	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,	\
 		 zpool_get_type((p)->zpools[0]))
 
+static unsigned long zswap_max_pages(void);
+
 /*********************************
 * pool functions
 **********************************/
 static void __zswap_pool_empty(struct percpu_ref *ref);
 
+/*
+ * Estimate the optimal number of zpools based on CPU and memory.
+ *
+ * For CPUs, aim for a 40% or lower probability that a thread busy-waits,
+ * assuming all cores are accessing zswap concurrently.
+ * The threshold is chosen for the simplicity of the formula:
+ * the probability is 1-(1-(1/pool))^(thr-1). For the 40% threshold, this is
+ * approximately pool = 2 * threads, rounded up to a power of 2.
+ * Threads \ Pools
+ *          2      4      8      16     32
+ *   2    0.50   0.25 < 0.13   0.06   0.03
+ *   4    0.88   0.58   0.33 < 0.18   0.09
+ *   6    0.97   0.76   0.49   0.28 < 0.15
+ *   8    0.99   0.87   0.61   0.36 < 0.20
+ *  10    1.00   0.92   0.70   0.44   0.25 <
+ *  16    1.00   0.99   0.87   0.62   0.38 <
+ *  18    1.00   0.99   0.90   0.67   0.42
+ *
+ * For memory, expect 90% pool usage for zsmalloc in the best case.
+ * Assuming uniform distribution, we need to store:
+ *   590       : sum of pages_per_zspage
+ *   * 0.5     : about half of each zspage is empty if there is no fragmentation
+ *   / (1-0.9) : 90% target usage
+ *   = 2950    : expected max pages of a zpool,
+ *               equivalent to 60MB RAM for a 20% max_pool_percent.
+ */
+static void __zswap_set_nr_zpools(struct zswap_pool *pool)
+{
+	unsigned long mem = zswap_max_pages();
+	unsigned long cpu = num_online_cpus();
+
+	mem = DIV_ROUND_UP(mem, 2950);
+	mem = min(max(1, mem), ZSWAP_NR_ZPOOLS_MAX);
+
+	if (cpu <= 1)
+		cpu = 1;
+	else
+		cpu = 1 << ilog2(min(cpu * 2, ZSWAP_NR_ZPOOLS_MAX));
+
+	pool->nr_zpools_order = ilog2(min(mem, cpu));
+}
+
 static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 {
 	int i;
@@ -271,7 +319,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 	if (!pool)
 		return NULL;
 
-	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++) {
+	__zswap_set_nr_zpools(pool);
+
+	for (i = 0; i < (1 << pool->nr_zpools_order); i++) {
 		/* unique name for each pool specifically required by zsmalloc */
 		snprintf(name, 38, "zswap%x",
 			 atomic_inc_return(&zswap_pools_count));
@@ -372,7 +422,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
 	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
 	free_percpu(pool->acomp_ctx);
 
-	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+	for (i = 0; i < (1 << pool->nr_zpools_order); i++)
 		zpool_destroy_pool(pool->zpools[i]);
 	kfree(pool);
 }
@@ -513,7 +563,7 @@ unsigned long zswap_total_pages(void)
 	list_for_each_entry_rcu(pool, &zswap_pools, list) {
 		int i;
 
-		for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
+		for (i = 0; i < (1 << pool->nr_zpools_order); i++)
 			total += zpool_get_total_pages(pool->zpools[i]);
 	}
 	rcu_read_unlock();
@@ -822,7 +872,10 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
 
 static struct zpool *zswap_find_zpool(struct zswap_entry *entry)
 {
-	return entry->pool->zpools[hash_ptr(entry, ilog2(ZSWAP_NR_ZPOOLS))];
+	if (entry->pool->nr_zpools_order == 0)
+		return entry->pool->zpools[0];
+
+	return entry->pool->zpools[hash_ptr(entry, entry->pool->nr_zpools_order)];
 }
 
 /*
-- 
2.43.0