Re: [PATCH 1/2] mm: zswap: optimize zswap pool size tracking

Chengming Zhou <chengming.zhou@xxxxxxxxx> · Tue, 12 Mar 2024 12:55:14 +0800

On 2024/3/12 00:12, Johannes Weiner wrote:
> Profiling the munmap() of a zswapped memory region shows 50%(!) of the
> total cycles currently going into updating the zswap_pool_total_size.
> 
> There are three consumers of this counter:
> - store, to enforce the globally configured pool limit
> - meminfo & debugfs, to report the size to the user
> - shrink, to determine the batch size for each cycle
> 
> Instead of aggregating everytime an entry enters or exits the zswap
> pool, aggregate the value from the zpools on-demand:
> 
> - Stores aggregate the counter anyway upon success. Aggregating to
>   check the limit instead is the same amount of work.
> 
> - Meminfo & debugfs might benefit somewhat from a pre-aggregated
>   counter, but aren't exactly hotpaths.
> 
> - Shrinking can aggregate once for every cycle instead of doing it for
>   every freed entry. As the shrinker might work on tens or hundreds of
>   objects per scan cycle, this is a large reduction in aggregations.
> 
> The paths that benefit dramatically are swapin, swapoff, and
> unmaps. There could be millions of pages being processed until
> somebody asks for the pool size again. This eliminates the pool size
> updates from those paths entirely.
> 
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>

Great! This is a clever simplification and optimization.

With the incremental diff, feel free to add:

Reviewed-by: Chengming Zhou <chengming.zhou@xxxxxxxxx>

Thanks.

> ---
>  fs/proc/meminfo.c     |  3 +-
>  include/linux/zswap.h |  2 +-
>  mm/zswap.c            | 98 +++++++++++++++++++++----------------------
>  3 files changed, 49 insertions(+), 54 deletions(-)
> 
> diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
> index 45af9a989d40..245171d9164b 100644
> --- a/fs/proc/meminfo.c
> +++ b/fs/proc/meminfo.c
> @@ -89,8 +89,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
>  	show_val_kb(m, "SwapTotal:      ", i.totalswap);
>  	show_val_kb(m, "SwapFree:       ", i.freeswap);
>  #ifdef CONFIG_ZSWAP
> -	seq_printf(m,  "Zswap:          %8lu kB\n",
> -		   (unsigned long)(zswap_pool_total_size >> 10));
> +	show_val_kb(m, "Zswap:          ", zswap_total_pages());
>  	seq_printf(m,  "Zswapped:       %8lu kB\n",
>  		   (unsigned long)atomic_read(&zswap_stored_pages) <<
>  		   (PAGE_SHIFT - 10));
> diff --git a/include/linux/zswap.h b/include/linux/zswap.h
> index 341aea490070..2a85b941db97 100644
> --- a/include/linux/zswap.h
> +++ b/include/linux/zswap.h
> @@ -7,7 +7,6 @@
>  
>  struct lruvec;
>  
> -extern u64 zswap_pool_total_size;
>  extern atomic_t zswap_stored_pages;
>  
>  #ifdef CONFIG_ZSWAP
> @@ -27,6 +26,7 @@ struct zswap_lruvec_state {
>  	atomic_long_t nr_zswap_protected;
>  };
>  
> +unsigned long zswap_total_pages(void);
>  bool zswap_store(struct folio *folio);
>  bool zswap_load(struct folio *folio);
>  void zswap_invalidate(swp_entry_t swp);
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 9a3237752082..7c39327a7cc2 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -43,8 +43,6 @@
>  /*********************************
>  * statistics
>  **********************************/
> -/* Total bytes used by the compressed storage */
> -u64 zswap_pool_total_size;
>  /* The number of compressed pages currently stored in zswap */
>  atomic_t zswap_stored_pages = ATOMIC_INIT(0);
>  /* The number of same-value filled pages currently stored in zswap */
> @@ -264,45 +262,6 @@ static inline struct zswap_tree *swap_zswap_tree(swp_entry_t swp)
>  	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
>  		 zpool_get_type((p)->zpools[0]))
>  
> -static bool zswap_is_full(void)
> -{
> -	return totalram_pages() * zswap_max_pool_percent / 100 <
> -			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
> -}
> -
> -static bool zswap_can_accept(void)
> -{
> -	return totalram_pages() * zswap_accept_thr_percent / 100 *
> -				zswap_max_pool_percent / 100 >
> -			DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
> -}
> -
> -static u64 get_zswap_pool_size(struct zswap_pool *pool)
> -{
> -	u64 pool_size = 0;
> -	int i;
> -
> -	for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> -		pool_size += zpool_get_total_size(pool->zpools[i]);
> -
> -	return pool_size;
> -}
> -
> -static void zswap_update_total_size(void)
> -{
> -	struct zswap_pool *pool;
> -	u64 total = 0;
> -
> -	rcu_read_lock();
> -
> -	list_for_each_entry_rcu(pool, &zswap_pools, list)
> -		total += get_zswap_pool_size(pool);
> -
> -	rcu_read_unlock();
> -
> -	zswap_pool_total_size = total;
> -}
> -
>  /*********************************
>  * pool functions
>  **********************************/
> @@ -540,6 +499,28 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
>  	return NULL;
>  }
>  
> +static unsigned long zswap_max_pages(void)
> +{
> +	return totalram_pages() * zswap_max_pool_percent / 100;
> +}
> +
> +unsigned long zswap_total_pages(void)
> +{
> +	struct zswap_pool *pool;
> +	u64 total = 0;
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(pool, &zswap_pools, list) {
> +		int i;
> +
> +		for (i = 0; i < ZSWAP_NR_ZPOOLS; i++)
> +			total += zpool_get_total_size(pool->zpools[i]);
> +	}
> +	rcu_read_unlock();
> +
> +	return total >> PAGE_SHIFT;
> +}
> +
>  /*********************************
>  * param callbacks
>  **********************************/
> @@ -912,7 +893,6 @@ static void zswap_entry_free(struct zswap_entry *entry)
>  	}
>  	zswap_entry_cache_free(entry);
>  	atomic_dec(&zswap_stored_pages);
> -	zswap_update_total_size();
>  }
>  
>  /*
> @@ -1317,7 +1297,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
>  	nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED);
>  #else
>  	/* use pool stats instead of memcg stats */
> -	nr_backing = zswap_pool_total_size >> PAGE_SHIFT;
> +	nr_backing = zswap_total_pages();
>  	nr_stored = atomic_read(&zswap_nr_stored);
>  #endif
>  
> @@ -1385,6 +1365,10 @@ static void shrink_worker(struct work_struct *w)
>  {
>  	struct mem_cgroup *memcg;
>  	int ret, failures = 0;
> +	unsigned long thr;
> +
> +	/* Reclaim down to the accept threshold */
> +	thr = zswap_max_pages() * zswap_accept_thr_percent / 100;
>  
>  	/* global reclaim will select cgroup in a round-robin fashion. */
>  	do {
> @@ -1432,10 +1416,9 @@ static void shrink_worker(struct work_struct *w)
>  			break;
>  		if (ret && ++failures == MAX_RECLAIM_RETRIES)
>  			break;
> -
>  resched:
>  		cond_resched();
> -	} while (!zswap_can_accept());
> +	} while (zswap_total_pages() > thr);
>  }
>  
>  static int zswap_is_page_same_filled(void *ptr, unsigned long *value)
> @@ -1476,6 +1459,7 @@ bool zswap_store(struct folio *folio)
>  	struct zswap_entry *entry, *dupentry;
>  	struct obj_cgroup *objcg = NULL;
>  	struct mem_cgroup *memcg = NULL;
> +	unsigned long max_pages, cur_pages;
>  
>  	VM_WARN_ON_ONCE(!folio_test_locked(folio));
>  	VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> @@ -1487,6 +1471,7 @@ bool zswap_store(struct folio *folio)
>  	if (!zswap_enabled)
>  		goto check_old;
>  
> +	/* Check cgroup limits */
>  	objcg = get_obj_cgroup_from_folio(folio);
>  	if (objcg && !obj_cgroup_may_zswap(objcg)) {
>  		memcg = get_mem_cgroup_from_objcg(objcg);
> @@ -1497,15 +1482,20 @@ bool zswap_store(struct folio *folio)
>  		mem_cgroup_put(memcg);
>  	}
>  
> -	/* reclaim space if needed */
> -	if (zswap_is_full()) {
> +	/* Check global limits */
> +	cur_pages = zswap_total_pages();
> +	max_pages = zswap_max_pages();
> +
> +	if (cur_pages >= max_pages) {
>  		zswap_pool_limit_hit++;
>  		zswap_pool_reached_full = true;
>  		goto shrink;
>  	}
>  
>  	if (zswap_pool_reached_full) {
> -	       if (!zswap_can_accept())
> +		unsigned long thr = max_pages * zswap_accept_thr_percent / 100;
> +
> +		if (cur_pages > thr)
>  			goto shrink;
>  		else
>  			zswap_pool_reached_full = false;
> @@ -1581,7 +1571,6 @@ bool zswap_store(struct folio *folio)
>  
>  	/* update stats */
>  	atomic_inc(&zswap_stored_pages);
> -	zswap_update_total_size();
>  	count_vm_event(ZSWPOUT);
>  
>  	return true;
> @@ -1711,6 +1700,13 @@ void zswap_swapoff(int type)
>  
>  static struct dentry *zswap_debugfs_root;
>  
> +static int debugfs_get_total_size(void *data, u64 *val)
> +{
> +	*val = zswap_total_pages() * PAGE_SIZE;
> +	return 0;
> +}
> +DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu");
> +
>  static int zswap_debugfs_init(void)
>  {
>  	if (!debugfs_initialized())
> @@ -1732,8 +1728,8 @@ static int zswap_debugfs_init(void)
>  			   zswap_debugfs_root, &zswap_reject_compress_poor);
>  	debugfs_create_u64("written_back_pages", 0444,
>  			   zswap_debugfs_root, &zswap_written_back_pages);
> -	debugfs_create_u64("pool_total_size", 0444,
> -			   zswap_debugfs_root, &zswap_pool_total_size);
> +	debugfs_create_file("pool_total_size", 0444,
> +			    zswap_debugfs_root, NULL, &total_size_fops);
>  	debugfs_create_atomic_t("stored_pages", 0444,
>  				zswap_debugfs_root, &zswap_stored_pages);
>  	debugfs_create_atomic_t("same_filled_pages", 0444,