On Tue, Jan 14, 2025 at 2:00 AM Kairui Song <ryncsn@xxxxxxxxx> wrote:
>
> From: Kairui Song <kasong@xxxxxxxxxxx>
>
> Non-rotational devices (SSD / ZRAM) can tolerate fragmentation, so the
> goal of the SWAP allocator is to avoid contention for clusters. It uses
> a per-CPU cluster design, and each CPU will use a different cluster as
> much as possible.
>
> However, HDDs are very sensitive to fragmentation; contention is trivial
> in comparison. Therefore, we use one global cluster instead. This ensures
> that each order will be written to the same cluster as much as possible,
> which helps make the I/O more continuous.
>
> This ensures that the performance of the cluster allocator is as good as
> that of the old allocator. Tests after this commit compared to those
> before this series:
>
> make -j32 with tinyconfig, using 1G memcg limit and HDD swap:
>
> Before this series:
> 114.44user 29.11system 39:42.90elapsed 6%CPU (0avgtext+0avgdata 157284maxresident)k
> 2901232inputs+0outputs (238877major+4227640minor)pagefaults
>
> After this commit:
> 113.90user 23.81system 38:11.77elapsed 6%CPU (0avgtext+0avgdata 157260maxresident)k
> 2548728inputs+0outputs (235471major+4238110minor)pagefaults
>
> Suggested-by: Chris Li <chrisl@xxxxxxxxxx>
> Signed-off-by: Kairui Song <kasong@xxxxxxxxxxx>
> ---
>  include/linux/swap.h |  2 ++
>  mm/swapfile.c        | 51 ++++++++++++++++++++++++++++++++------------
>  2 files changed, 39 insertions(+), 14 deletions(-)
>
> diff --git a/include/linux/swap.h b/include/linux/swap.h
> index 4c1d2e69689f..b13b72645db3 100644
> --- a/include/linux/swap.h
> +++ b/include/linux/swap.h
> @@ -318,6 +318,8 @@ struct swap_info_struct {
>         unsigned int pages;             /* total of usable pages of swap */
>         atomic_long_t inuse_pages;      /* number of those currently in use */
>         struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
> +       struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */
> +       spinlock_t global_cluster_lock; /* Serialize usage of global cluster */
>         struct rb_root swap_extent_root;/* root of the swap extent rbtree */
>         struct block_device *bdev;      /* swap device or bdev of swap file */
>         struct file *swap_file;         /* seldom referenced */
> diff --git a/mm/swapfile.c b/mm/swapfile.c
> index 37d540fa0310..793b2fd1a2a8 100644
> --- a/mm/swapfile.c
> +++ b/mm/swapfile.c
> @@ -820,7 +820,10 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
>  out:
>         relocate_cluster(si, ci);
>         unlock_cluster(ci);
> -       __this_cpu_write(si->percpu_cluster->next[order], next);
> +       if (si->flags & SWP_SOLIDSTATE)
> +               __this_cpu_write(si->percpu_cluster->next[order], next);
> +       else
> +               si->global_cluster->next[order] = next;
>         return found;
>  }
>
> @@ -881,9 +884,16 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>         struct swap_cluster_info *ci;
>         unsigned int offset, found = 0;
>
> -       /* Fast path using per CPU cluster */
> -       local_lock(&si->percpu_cluster->lock);
> -       offset = __this_cpu_read(si->percpu_cluster->next[order]);
> +       if (si->flags & SWP_SOLIDSTATE) {
> +               /* Fast path using per CPU cluster */
> +               local_lock(&si->percpu_cluster->lock);
> +               offset = __this_cpu_read(si->percpu_cluster->next[order]);
> +       } else {
> +               /* Serialize HDD SWAP allocation for each device. */
> +               spin_lock(&si->global_cluster_lock);
> +               offset = si->global_cluster->next[order];
> +       }
> +
>         if (offset) {
>                 ci = lock_cluster(si, offset);
>                 /* Cluster could have been used by another order */
> @@ -975,8 +985,10 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
>                 }
>         }
>  done:
> -       local_unlock(&si->percpu_cluster->lock);
> -
> +       if (si->flags & SWP_SOLIDSTATE)
> +               local_unlock(&si->percpu_cluster->lock);
> +       else
> +               spin_unlock(&si->global_cluster_lock);
>         return found;
>  }
>
> @@ -2784,6 +2796,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
>         mutex_unlock(&swapon_mutex);
>         free_percpu(p->percpu_cluster);
>         p->percpu_cluster = NULL;
> +       kfree(p->global_cluster);
> +       p->global_cluster = NULL;
>         vfree(swap_map);
>         kvfree(zeromap);
>         kvfree(cluster_info);
> @@ -3189,17 +3203,24 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
>         for (i = 0; i < nr_clusters; i++)
>                 spin_lock_init(&cluster_info[i].lock);
>
> -       si->percpu_cluster = alloc_percpu(struct percpu_cluster);
> -       if (!si->percpu_cluster)
> -               goto err_free;
> +       if (si->flags & SWP_SOLIDSTATE) {
> +               si->percpu_cluster = alloc_percpu(struct percpu_cluster);
> +               if (!si->percpu_cluster)
> +                       goto err_free;
>
> -       for_each_possible_cpu(cpu) {
> -               struct percpu_cluster *cluster;
> +               for_each_possible_cpu(cpu) {
> +                       struct percpu_cluster *cluster;
>
> -               cluster = per_cpu_ptr(si->percpu_cluster, cpu);
> +                       cluster = per_cpu_ptr(si->percpu_cluster, cpu);
> +                       for (i = 0; i < SWAP_NR_ORDERS; i++)
> +                               cluster->next[i] = SWAP_ENTRY_INVALID;
> +                       local_lock_init(&cluster->lock);
> +               }
> +       } else {
> +               si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL);

Hi Andrew,

Sorry, I just found I missed this tiny fix here; can you help fold it
into the current unstable tree?

diff --git a/mm/swapfile.c b/mm/swapfile.c
index e57e5453a25b..559b8e62ff71 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -3212,6 +3212,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
                }
        } else {
                si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL);
+               if (!si->global_cluster)
+                       goto err_free;
                for (i = 0; i < SWAP_NR_ORDERS; i++)
                        si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
                spin_lock_init(&si->global_cluster_lock);
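
For reference, with the fix folded in, the rotational-device branch of
setup_clusters() should end up reading roughly as below (reconstructed from
the hunks above, not a verbatim excerpt from the tree):

        } else {
                /* Rotational device: one global cluster shared by all CPUs */
                si->global_cluster = kmalloc(sizeof(*si->global_cluster), GFP_KERNEL);
                if (!si->global_cluster)
                        goto err_free;
                for (i = 0; i < SWAP_NR_ORDERS; i++)
                        si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
                spin_lock_init(&si->global_cluster_lock);
        }

So a failed allocation now takes the same err_free path as the percpu_cluster
case instead of being dereferenced later.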