+ mm-swapfile-fix-ssd-detection-with-swapfile-on-btrfs.patch added to mm-unstable branch

Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> · Thu, 22 Aug 2024 11:26:54 -0700

The patch titled
     Subject: mm: swapfile: fix SSD detection with swapfile on btrfs
has been added to the -mm mm-unstable branch.  Its filename is
     mm-swapfile-fix-ssd-detection-with-swapfile-on-btrfs.patch

This patch will shortly appear at
     https://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new.git/tree/patches/mm-swapfile-fix-ssd-detection-with-swapfile-on-btrfs.patch

This patch will later appear in the mm-unstable branch at
    git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm

Before you just go and hit "reply", please:
   a) Consider who else should be cc'ed
   b) Prefer to cc a suitable mailing list as well
   c) Ideally: find the original patch on the mailing list and do a
      reply-to-all to that, adding suitable additional cc's

*** Remember to use Documentation/process/submit-checklist.rst when testing your code ***

The -mm tree is included in linux-next via the mm-everything
branch at git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
and is updated there every 2-3 working days.

------------------------------------------------------
From: Johannes Weiner <hannes@xxxxxxxxxxx>
Subject: mm: swapfile: fix SSD detection with swapfile on btrfs
Date: Thu, 22 Aug 2024 13:24:58 +0200

We've been noticing a trend of significant lock contention in the swap
subsystem as core counts have been increasing in our fleet.  It turns out
that our swapfiles on btrfs on flash were in fact using the old swap code
for rotational storage.

This turns out to be a detection issue in the swapon sequence: btrfs sets
si->bdev during swap activation, which currently happens *after* swapon's
SSD detection and cluster setup.  Thus, none of the SSD optimizations and
cluster lock splitting are enabled for btrfs swap.

Rearrange the swapon sequence so that filesystem activation happens
*before* determining swap behavior based on the backing device.

Afterwards, the nonrotational drive is detected correctly:

- Adding 2097148k swap on /mnt/swapfile.  Priority:-3 extents:1 across:2097148k
+ Adding 2097148k swap on /mnt/swapfile.  Priority:-3 extents:1 across:2097148k SS

Link: https://lkml.kernel.org/r/20240822112707.351844-1-hannes@xxxxxxxxxxx
Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
Cc: "Huang, Ying" <ying.huang@xxxxxxxxx>
Cc: Hugh Dickins <hughd@xxxxxxxxxx>
Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
---

 mm/swapfile.c |  165 +++++++++++++++++++++++++-----------------------
 1 file changed, 86 insertions(+), 79 deletions(-)

--- a/mm/swapfile.c~mm-swapfile-fix-ssd-detection-with-swapfile-on-btrfs
+++ a/mm/swapfile.c
@@ -3196,29 +3196,15 @@ static unsigned long read_swap_header(st
 static int setup_swap_map_and_extents(struct swap_info_struct *si,
 					union swap_header *swap_header,
 					unsigned char *swap_map,
-					struct swap_cluster_info *cluster_info,
 					unsigned long maxpages,
 					sector_t *span)
 {
-	unsigned int j, k;
 	unsigned int nr_good_pages;
+	unsigned long i;
 	int nr_extents;
-	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
-	unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
-	unsigned long i, idx;
 
 	nr_good_pages = maxpages - 1;	/* omit header page */
 
-	INIT_LIST_HEAD(&si->free_clusters);
-	INIT_LIST_HEAD(&si->full_clusters);
-	INIT_LIST_HEAD(&si->discard_clusters);
-
-	for (i = 0; i < SWAP_NR_ORDERS; i++) {
-		INIT_LIST_HEAD(&si->nonfull_clusters[i]);
-		INIT_LIST_HEAD(&si->frag_clusters[i]);
-		si->frag_cluster_nr[i] = 0;
-	}
-
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
 		unsigned int page_nr = swap_header->info.badpages[i];
 		if (page_nr == 0 || page_nr > swap_header->info.last_page)
@@ -3226,25 +3212,11 @@ static int setup_swap_map_and_extents(st
 		if (page_nr < maxpages) {
 			swap_map[page_nr] = SWAP_MAP_BAD;
 			nr_good_pages--;
-			/*
-			 * Haven't marked the cluster free yet, no list
-			 * operation involved
-			 */
-			inc_cluster_info_page(si, cluster_info, page_nr);
 		}
 	}
 
-	/* Haven't marked the cluster free yet, no list operation involved */
-	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
-		inc_cluster_info_page(si, cluster_info, i);
-
 	if (nr_good_pages) {
 		swap_map[0] = SWAP_MAP_BAD;
-		/*
-		 * Not mark the cluster free yet, no list
-		 * operation involved
-		 */
-		inc_cluster_info_page(si, cluster_info, 0);
 		si->max = maxpages;
 		si->pages = nr_good_pages;
 		nr_extents = setup_swap_extents(si, span);
@@ -3257,8 +3229,70 @@ static int setup_swap_map_and_extents(st
 		return -EINVAL;
 	}
 
+	return nr_extents;
+}
+
+static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
+						union swap_header *swap_header,
+						unsigned long maxpages)
+{
+	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
+	unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
+	struct swap_cluster_info *cluster_info;
+	unsigned long i, j, k, idx;
+	int cpu, err = -ENOMEM;
+
+	cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
 	if (!cluster_info)
-		return nr_extents;
+		goto err;
+
+	for (i = 0; i < nr_clusters; i++)
+		spin_lock_init(&cluster_info[i].lock);
+
+	si->cluster_next_cpu = alloc_percpu(unsigned int);
+	if (!si->cluster_next_cpu)
+		goto err_free;
+
+	/* Random start position to help with wear leveling */
+	for_each_possible_cpu(cpu)
+		per_cpu(*si->cluster_next_cpu, cpu) =
+		get_random_u32_inclusive(1, si->highest_bit);
+
+	si->percpu_cluster = alloc_percpu(struct percpu_cluster);
+	if (!si->percpu_cluster)
+		goto err_free;
+
+	for_each_possible_cpu(cpu) {
+		struct percpu_cluster *cluster;
+
+		cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+		for (i = 0; i < SWAP_NR_ORDERS; i++)
+			cluster->next[i] = SWAP_NEXT_INVALID;
+	}
+
+	/*
+	 * Mark unusable pages as unavailable. The clusters aren't
+	 * marked free yet, so no list operations are involved yet.
+	 *
+	 * See setup_swap_map_and_extents(): header page, bad pages,
+	 * and the EOF part of the last cluster.
+	 */
+	inc_cluster_info_page(si, cluster_info, 0);
+	for (i = 0; i < swap_header->info.nr_badpages; i++)
+		inc_cluster_info_page(si, cluster_info,
+				      swap_header->info.badpages[i]);
+	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
+		inc_cluster_info_page(si, cluster_info, i);
+
+	INIT_LIST_HEAD(&si->free_clusters);
+	INIT_LIST_HEAD(&si->full_clusters);
+	INIT_LIST_HEAD(&si->discard_clusters);
+
+	for (i = 0; i < SWAP_NR_ORDERS; i++) {
+		INIT_LIST_HEAD(&si->nonfull_clusters[i]);
+		INIT_LIST_HEAD(&si->frag_clusters[i]);
+		si->frag_cluster_nr[i] = 0;
+	}
 
 	/*
 	 * Reduce false cache line sharing between cluster_info and
@@ -3281,7 +3315,13 @@ static int setup_swap_map_and_extents(st
 			list_add_tail(&ci->list, &si->free_clusters);
 		}
 	}
-	return nr_extents;
+
+	return cluster_info;
+
+err_free:
+	kvfree(cluster_info);
+err:
+	return ERR_PTR(err);
 }
 
 SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
@@ -3377,6 +3417,17 @@ SYSCALL_DEFINE2(swapon, const char __use
 		goto bad_swap_unlock_inode;
 	}
 
+	error = swap_cgroup_swapon(si->type, maxpages);
+	if (error)
+		goto bad_swap_unlock_inode;
+
+	nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map,
+						maxpages, &span);
+	if (unlikely(nr_extents < 0)) {
+		error = nr_extents;
+		goto bad_swap_unlock_inode;
+	}
+
 	if (si->bdev && bdev_stable_writes(si->bdev))
 		si->flags |= SWP_STABLE_WRITES;
 
@@ -3384,63 +3435,19 @@ SYSCALL_DEFINE2(swapon, const char __use
 		si->flags |= SWP_SYNCHRONOUS_IO;
 
 	if (si->bdev && bdev_nonrot(si->bdev)) {
-		int cpu, i;
-		unsigned long ci, nr_cluster;
-
 		si->flags |= SWP_SOLIDSTATE;
-		si->cluster_next_cpu = alloc_percpu(unsigned int);
-		if (!si->cluster_next_cpu) {
-			error = -ENOMEM;
-			goto bad_swap_unlock_inode;
-		}
-		/*
-		 * select a random position to start with to help wear leveling
-		 * SSD
-		 */
-		for_each_possible_cpu(cpu) {
-			per_cpu(*si->cluster_next_cpu, cpu) =
-				get_random_u32_inclusive(1, si->highest_bit);
-		}
-		nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
 
-		cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
-					GFP_KERNEL);
-		if (!cluster_info) {
-			error = -ENOMEM;
+		cluster_info = setup_clusters(si, swap_header, maxpages);
+		if (IS_ERR(cluster_info)) {
+			error = PTR_ERR(cluster_info);
+			cluster_info = NULL;
 			goto bad_swap_unlock_inode;
 		}
-
-		for (ci = 0; ci < nr_cluster; ci++)
-			spin_lock_init(&((cluster_info + ci)->lock));
-
-		si->percpu_cluster = alloc_percpu(struct percpu_cluster);
-		if (!si->percpu_cluster) {
-			error = -ENOMEM;
-			goto bad_swap_unlock_inode;
-		}
-		for_each_possible_cpu(cpu) {
-			struct percpu_cluster *cluster;
-
-			cluster = per_cpu_ptr(si->percpu_cluster, cpu);
-			for (i = 0; i < SWAP_NR_ORDERS; i++)
-				cluster->next[i] = SWAP_NEXT_INVALID;
-		}
 	} else {
 		atomic_inc(&nr_rotate_swap);
 		inced_nr_rotate_swap = true;
 	}
 
-	error = swap_cgroup_swapon(si->type, maxpages);
-	if (error)
-		goto bad_swap_unlock_inode;
-
-	nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map,
-		cluster_info, maxpages, &span);
-	if (unlikely(nr_extents < 0)) {
-		error = nr_extents;
-		goto bad_swap_unlock_inode;
-	}
-
 	if ((swap_flags & SWAP_FLAG_DISCARD) &&
 	    si->bdev && bdev_max_discard_sectors(si->bdev)) {
 		/*
_

Patches currently in -mm which might be from hannes@xxxxxxxxxxx are

mm-swapfile-fix-ssd-detection-with-swapfile-on-btrfs.patch