On 02/25/25 at 02:02am, Kairui Song wrote:
......snip...
> @@ -1204,19 +1252,36 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
>  	int n_ret = 0;
>  	int node;
>  
> +	/* Fast path using percpu cluster */
> +	local_lock(&percpu_swap_cluster.lock);
> +	n_ret = swap_alloc_fast(swp_entries,
> +				SWAP_HAS_CACHE,
> +				order, n_goal);

Since we always pass SWAP_HAS_CACHE as the 2nd parameter, we can
hard-code it inside the swap_alloc_fast() function, unless another call
site will pass a different value for 'usage'. The same applies to the
2nd parameter of swap_alloc_slow().

Anyway, it's not a big deal and could just be my personal preference.
Other than this nit, this looks good to me.
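For the record, what I mean is roughly the below. It's untested, and I'm
guessing the exact prototype of swap_alloc_fast() since only the call
site is quoted here:

	/*
	 * 'usage' parameter dropped (prototype guessed from the call site);
	 * SWAP_HAS_CACHE is then used directly inside the function body
	 * wherever 'usage' was used before.
	 */
	static int swap_alloc_fast(swp_entry_t entries[], int order, int n_goal);

so the fast path call in get_swap_pages() becomes:

	n_ret = swap_alloc_fast(swp_entries, order, n_goal);

swap_alloc_slow() could be adjusted the same way.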
> +	if (n_ret == n_goal)
> +		goto out;
> +
> +	n_goal = min_t(int, n_goal - n_ret, SWAP_BATCH);
> +	/* Rotate the device and switch to a new cluster */
>  	spin_lock(&swap_avail_lock);
>  start_over:
>  	node = numa_node_id();
>  	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
> -		/* requeue si to after same-priority siblings */
>  		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
>  		spin_unlock(&swap_avail_lock);
>  		if (get_swap_device_info(si)) {
> -			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
> -					n_goal, swp_entries, order);
> +			/*
> +			 * For order 0 allocation, try best to fill the request
> +			 * as it's used by slot cache.
> +			 *
> +			 * For mTHP allocation, it always have n_goal == 1,
> +			 * and falling a mTHP swapin will just make the caller
> +			 * fallback to order 0 allocation, so just bail out.
> +			 */
> +			n_ret += scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal,
> +					swp_entries + n_ret, order);
>  			put_swap_device(si);
>  			if (n_ret || size > 1)
> -				goto check_out;
> +				goto out;
>  		}
>  
>  		spin_lock(&swap_avail_lock);
> @@ -1234,12 +1299,10 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
>  		if (plist_node_empty(&next->avail_lists[node]))
>  			goto start_over;
>  	}
> -
>  	spin_unlock(&swap_avail_lock);
> -
> -check_out:
> +out:
> +	local_unlock(&percpu_swap_cluster.lock);
>  	atomic_long_sub(n_ret * size, &nr_swap_pages);
> -
>  	return n_ret;
>  }
>  
> @@ -2725,8 +2788,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>  	arch_swap_invalidate_area(p->type);
>  	zswap_swapoff(p->type);
>  	mutex_unlock(&swapon_mutex);
> -	free_percpu(p->percpu_cluster);
> -	p->percpu_cluster = NULL;
>  	kfree(p->global_cluster);
>  	p->global_cluster = NULL;
>  	vfree(swap_map);
> @@ -3125,7 +3186,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
>  	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
>  	struct swap_cluster_info *cluster_info;
>  	unsigned long i, j, idx;
> -	int cpu, err = -ENOMEM;
> +	int err = -ENOMEM;
>  
>  	cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
>  	if (!cluster_info)
> @@ -3134,20 +3195,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
>  	for (i = 0; i < nr_clusters; i++)
>  		spin_lock_init(&cluster_info[i].lock);
>  
> -	if (si->flags & SWP_SOLIDSTATE) {
> -		si->percpu_cluster = alloc_percpu(struct percpu_cluster);
> -		if (!si->percpu_cluster)
> -			goto err_free;
> -
> -		for_each_possible_cpu(cpu) {
> -			struct percpu_cluster *cluster;
> -
> -			cluster = per_cpu_ptr(si->percpu_cluster, cpu);
> -			for (i = 0; i < SWAP_NR_ORDERS; i++)
> -				cluster->next[i] = SWAP_ENTRY_INVALID;
> -			local_lock_init(&cluster->lock);
> -		}
> -	} else {
> +	if (!(si->flags & SWP_SOLIDSTATE)) {
>  		si->global_cluster = kmalloc(sizeof(*si->global_cluster),
>  					     GFP_KERNEL);
>  		if (!si->global_cluster)
> @@ -3424,8 +3472,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
>  bad_swap_unlock_inode:
>  	inode_unlock(inode);
>  bad_swap:
> -	free_percpu(si->percpu_cluster);
> -	si->percpu_cluster = NULL;
>  	kfree(si->global_cluster);
>  	si->global_cluster = NULL;
>  	inode = NULL;
> -- 
> 2.48.1
> 