On 12/31/24 at 01:46am, Kairui Song wrote:

......snip.....

> ---
>  include/linux/swap.h |   3 +-
>  mm/swapfile.c        | 435 ++++++++++++++++++++++++-------------------
>  2 files changed, 246 insertions(+), 192 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 339d7f0192ff..c4ff31cb6bde 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -291,6 +291,7 @@ enum swap_cluster_flags {
>   * throughput.
>   */
>  struct percpu_cluster {
> +        local_lock_t lock; /* Protect the percpu_cluster above */
>          unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
>  };
>
> @@ -313,7 +314,7 @@ struct swap_info_struct {
>          /* list of cluster that contains at least one free slot */
>          struct list_head frag_clusters[SWAP_NR_ORDERS];
>          /* list of cluster that are fragmented or contented */
> -        unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
> +        atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS];
>          unsigned int pages;           /* total of usable pages of swap */
>          atomic_long_t inuse_pages;    /* number of those currently in use */
>          struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 7795a3d27273..dadd4fead689 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -261,12 +261,10 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
>          folio_ref_sub(folio, nr_pages);
>          folio_set_dirty(folio);
>
> -        spin_lock(&si->lock);
>          /* Only sinple page folio can be backed by zswap */
>          if (nr_pages == 1)
>                  zswap_invalidate(entry);
>          swap_entry_range_free(si, entry, nr_pages);
> -        spin_unlock(&si->lock);
>          ret = nr_pages;
>  out_unlock:
>          folio_unlock(folio);
> @@ -403,7 +401,21 @@ static void discard_swap_cluster(struct swap_info_struct *si,
>
>  static inline bool cluster_is_free(struct swap_cluster_info *info)
>  {
> -        return info->flags == CLUSTER_FLAG_FREE;
> +        return info->count == 0;

This is a little confusing. Maybe we should add a separate helper and call
it cluster_is_empty(), because discarded clusters are also able to pass
this check.
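Something like the below is what I have in mind (completely untested sketch,
only to illustrate the naming; cluster_is_empty() is not part of this patch):

/*
 * No slot in this cluster is in use. Note a cluster that is pending or
 * undergoing discard is also "empty" here, but it is not usable or free
 * until the discard work returns it to the free list.
 */
static inline bool cluster_is_empty(struct swap_cluster_info *info)
{
        return info->count == 0;
}

Then the callers that really only care about "no used slots" (e.g. the
`if (cluster_is_free(ci)) offset = cluster_offset(si, ci);` check in the
fast path below) read correctly, and the name no longer suggests the
cluster is already on the free list.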
> +}
> +
> +static inline bool cluster_is_discard(struct swap_cluster_info *info)
> +{
> +        return info->flags == CLUSTER_FLAG_DISCARD;
> +}
> +
> +static inline bool cluster_is_usable(struct swap_cluster_info *ci, int order)
> +{
> +        if (unlikely(ci->flags > CLUSTER_FLAG_USABLE))
> +                return false;
> +        if (!order)
> +                return true;
> +        return cluster_is_free(ci) || order == ci->order;
>  }
>
>  static inline unsigned int cluster_index(struct swap_info_struct *si,
> @@ -440,19 +452,20 @@ static void cluster_move(struct swap_info_struct *si,
>  {
>          VM_WARN_ON(ci->flags == new_flags);
>          BUILD_BUG_ON(1 << sizeof(ci->flags) * BITS_PER_BYTE < CLUSTER_FLAG_MAX);
> +        lockdep_assert_held(&ci->lock);
>
> -        if (ci->flags == CLUSTER_FLAG_NONE) {
> +        spin_lock(&si->lock);
> +        if (ci->flags == CLUSTER_FLAG_NONE)
>                  list_add_tail(&ci->list, list);
> -        } else {
> -                if (ci->flags == CLUSTER_FLAG_FRAG) {
> -                        VM_WARN_ON(!si->frag_cluster_nr[ci->order]);
> -                        si->frag_cluster_nr[ci->order]--;
> -                }
> +        else
>                  list_move_tail(&ci->list, list);
> -        }
> +        spin_unlock(&si->lock);
> +
> +        if (ci->flags == CLUSTER_FLAG_FRAG)
> +                atomic_long_dec(&si->frag_cluster_nr[ci->order]);
> +        else if (new_flags == CLUSTER_FLAG_FRAG)
> +                atomic_long_inc(&si->frag_cluster_nr[ci->order]);
>          ci->flags = new_flags;
> -        if (new_flags == CLUSTER_FLAG_FRAG)
> -                si->frag_cluster_nr[ci->order]++;
>  }
>
>  /* Add a cluster to discard list and schedule it to do discard */
> @@ -475,39 +488,90 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
>
>  static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
>  {
> -        lockdep_assert_held(&si->lock);
>          lockdep_assert_held(&ci->lock);
>          cluster_move(si, ci, &si->free_clusters, CLUSTER_FLAG_FREE);
>          ci->order = 0;
>  }
>
> +/*
> + * Isolate and lock the first cluster that is not contented on a list,
> + * clean its flag before taken off-list. Cluster flag must be in sync
> + * with list status, so cluster updaters can always know the cluster
> + * list status without touching si lock.
> + *
> + * Note it's possible that all clusters on a list are contented so
> + * this returns NULL for an non-empty list.
> + */
> +static struct swap_cluster_info *cluster_isolate_lock(
> +                struct swap_info_struct *si, struct list_head *list)
> +{
> +        struct swap_cluster_info *ci, *ret = NULL;
> +
> +        spin_lock(&si->lock);
> +
> +        if (unlikely(!(si->flags & SWP_WRITEOK)))
> +                goto out;
> +
> +        list_for_each_entry(ci, list, list) {
> +                if (!spin_trylock(&ci->lock))
> +                        continue;
> +
> +                /* We may only isolate and clear flags of following lists */
> +                VM_BUG_ON(!ci->flags);
> +                VM_BUG_ON(ci->flags > CLUSTER_FLAG_USABLE &&
> +                          ci->flags != CLUSTER_FLAG_FULL);
> +
> +                list_del(&ci->list);
> +                ci->flags = CLUSTER_FLAG_NONE;
> +                ret = ci;
> +                break;
> +        }
> +out:
> +        spin_unlock(&si->lock);
> +
> +        return ret;
> +}
> +
>  /*
>   * Doing discard actually. After a cluster discard is finished, the cluster
> - * will be added to free cluster list. caller should hold si->lock.
> -*/
> -static void swap_do_scheduled_discard(struct swap_info_struct *si)
> + * will be added to free cluster list. Discard cluster is a bit special as
> + * they don't participate in allocation or reclaim, so clusters marked as
> + * CLUSTER_FLAG_DISCARD must remain off-list or on discard list.
> + */
> +static bool swap_do_scheduled_discard(struct swap_info_struct *si)
>  {
>          struct swap_cluster_info *ci;
> +        bool ret = false;
>          unsigned int idx;
>
> +        spin_lock(&si->lock);
>          while (!list_empty(&si->discard_clusters)) {
>                  ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
> +                /*
> +                 * Delete the cluster from list but don't clear its flags until
> +                 * discard is done, so isolation and relocation will skip it.
> +                 */
>                  list_del(&ci->list);

I don't understand the above comment. ci has been taken off the list here,
while allocation needs to isolate from a usable list. Even if we cleared
ci->flags now, how could isolation and relocation touch it? I may be
missing something here.

> -                /* Must clear flag when taking a cluster off-list */
> -                ci->flags = CLUSTER_FLAG_NONE;
>                  idx = cluster_index(si, ci);
>                  spin_unlock(&si->lock);
> -
>                  discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
>                                  SWAPFILE_CLUSTER);
>
> -                spin_lock(&si->lock);
>                  spin_lock(&ci->lock);
> -                __free_cluster(si, ci);
> +                /*
> +                 * Discard is done, clear its flags as it's now off-list,
> +                 * then return the cluster to allocation list.
> +                 */
> +                ci->flags = CLUSTER_FLAG_NONE;
>                  memset(si->swap_map + idx * SWAPFILE_CLUSTER,
>                                  0, SWAPFILE_CLUSTER);
> +                __free_cluster(si, ci);
>                  spin_unlock(&ci->lock);
> +                ret = true;
> +                spin_lock(&si->lock);
>          }
> +        spin_unlock(&si->lock);
> +        return ret;
>  }
>
>  static void swap_discard_work(struct work_struct *work)

......snip....

> @@ -791,29 +873,34 @@ static void swap_reclaim_work(struct work_struct *work)
>  static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
>                                                unsigned char usage)
>  {
> -        struct percpu_cluster *cluster;
>          struct swap_cluster_info *ci;
>          unsigned int offset, found = 0;
>
> -new_cluster:
> -        lockdep_assert_held(&si->lock);
> -        cluster = this_cpu_ptr(si->percpu_cluster);
> -        offset = cluster->next[order];
> +        /* Fast path using per CPU cluster */
> +        local_lock(&si->percpu_cluster->lock);
> +        offset = __this_cpu_read(si->percpu_cluster->next[order]);
>          if (offset) {
> -                offset = alloc_swap_scan_cluster(si, offset, &found, order, usage);
> +                ci = lock_cluster(si, offset);
> +                /* Cluster could have been used by another order */
> +                if (cluster_is_usable(ci, order)) {
> +                        if (cluster_is_free(ci))
> +                                offset = cluster_offset(si, ci);
> +                        offset = alloc_swap_scan_cluster(si, offset, &found,
> +                                                         order, usage);
> +                } else {
> +                        unlock_cluster(ci);
> +                }
>                  if (found)
>                          goto done;
>          }
>
> -        if (!list_empty(&si->free_clusters)) {
> -                ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
> -                offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage);
> -                /*
> -                 * Either we didn't touch the cluster due to swapoff,
> -                 * or the allocation must success.
> -                 */
> -                VM_BUG_ON((si->flags & SWP_WRITEOK) && !found);
> -                goto done;
> +new_cluster:
> +        ci = cluster_isolate_lock(si, &si->free_clusters);
> +        if (ci) {
> +                offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
> +                                                 &found, order, usage);
> +                if (found)
> +                        goto done;
>          }
>
>          /* Try reclaim from full clusters if free clusters list is drained */
> @@ -821,49 +908,45 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>                  swap_reclaim_full_clusters(si, false);
>
>          if (order < PMD_ORDER) {
> -                unsigned int frags = 0;
> +                unsigned int frags = 0, frags_existing;
>
> -                while (!list_empty(&si->nonfull_clusters[order])) {
> -                        ci = list_first_entry(&si->nonfull_clusters[order],
> -                                              struct swap_cluster_info, list);
> -                        cluster_move(si, ci, &si->frag_clusters[order], CLUSTER_FLAG_FRAG);
> +                while ((ci = cluster_isolate_lock(si, &si->nonfull_clusters[order]))) {
>                          offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
>                                                           &found, order, usage);
> -                        frags++;
> +                        /*
> +                         * With `fragmenting` set to true, it will surely take
                                   ~~~~~~~~~~~

wondering what 'fragmenting' means here.

> +                         * the cluster off nonfull list
> +                         */
>                          if (found)
>                                  goto done;
> +                        frags++;
>                  }
>
> -                /*
> -                 * Nonfull clusters are moved to frag tail if we reached
> -                 * here, count them too, don't over scan the frag list.
> -                 */
> -                while (frags < si->frag_cluster_nr[order]) {
> -                        ci = list_first_entry(&si->frag_clusters[order],
> -                                              struct swap_cluster_info, list);
> +                frags_existing = atomic_long_read(&si->frag_cluster_nr[order]);
> +                while (frags < frags_existing &&
> +                       (ci = cluster_isolate_lock(si, &si->frag_clusters[order]))) {
> +                        atomic_long_dec(&si->frag_cluster_nr[order]);
>                          /*
> -                         * Rotate the frag list to iterate, they were all failing
> -                         * high order allocation or moved here due to per-CPU usage,
> -                         * this help keeping usable cluster ahead.
> +                         * Rotate the frag list to iterate, they were all
> +                         * failing high order allocation or moved here due to
> +                         * per-CPU usage, but they could contain newly released
> +                         * reclaimable (eg. lazy-freed swap cache) slots.
>                          */
> -                        list_move_tail(&ci->list, &si->frag_clusters[order]);
>                          offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
>                                                           &found, order, usage);
> -                        frags++;
>                          if (found)
>                                  goto done;
> +                        frags++;
>                  }
>          }
>
> -        if (!list_empty(&si->discard_clusters)) {
> -                /*
> -                 * we don't have free cluster but have some clusters in
> -                 * discarding, do discard now and reclaim them, then
> -                 * reread cluster_next_cpu since we dropped si->lock
> -                 */
> -                swap_do_scheduled_discard(si);
> +        /*
> +         * We don't have free cluster but have some clusters in
> +         * discarding, do discard now and reclaim them, then
> +         * reread cluster_next_cpu since we dropped si->lock
> +         */
> +        if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si))
>                  goto new_cluster;
> -        }
>
>          if (order)
>                  goto done;

.....
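For my own notes while reviewing, this is how I read the nesting between the
two list-lock paths (condensed by me, not code from the patch):

        /*
         * Isolation side (cluster_isolate_lock()): si->lock outer,
         * ci->lock only via trylock; returns with ci->lock held.
         */
        spin_lock(&si->lock);
        if (spin_trylock(&ci->lock)) {
                list_del(&ci->list);
                ci->flags = CLUSTER_FLAG_NONE;
        }
        spin_unlock(&si->lock);

        /*
         * Relocation side (cluster_move()): ci->lock already held,
         * si->lock nested inside it just for the list manipulation.
         */
        lockdep_assert_held(&ci->lock);
        spin_lock(&si->lock);
        list_move_tail(&ci->list, list);
        spin_unlock(&si->lock);

Since the isolation side only ever trylocks ci->lock, the inverted nesting in
cluster_move() should not be able to deadlock, if I'm reading it right.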