On Tue, Mar 26, 2013 at 01:38:43PM +0800, Shaohua Li wrote:
> Swap cluster allocation exists to get better request merging and so improve
> performance. But the cluster is shared globally; if multiple tasks are doing
> swap, this causes interleaved disk access. Multiple tasks swapping is quite
> common, for example, each NUMA node has a kswapd thread doing swap, or
> multiple threads/processes do direct page reclaim.
>
> We make the cluster allocation per-cpu here. The interleaved disk access
> issue goes away: all tasks will do sequential swap.
>
> If one CPU can't get its per-cpu cluster, it falls back to scanning swap_map,
> so that CPU can still continue to swap. We don't need to recycle free swap
> entries of other CPUs.
>
> In my test (swap to a 2-disk raid0 partition), this improves swapout
> throughput by around 10%, and request size is increased significantly.
>
> How this impacts swap readahead is uncertain, though. On one side, page
> reclaim always isolates and swaps several adjacent pages, so it writes the
> pages sequentially, which benefits readahead. On the other side, several CPUs
> writing pages interleaved means the pages don't live _sequentially_ but
> relatively _near_ each other. In the per-cpu allocation case, if adjacent
> pages are written by different CPUs, they will live relatively _far_ apart.
> So how this impacts swap readahead depends on how many pages page reclaim
> isolates and swaps at one time: if the number is big, this patch will benefit
> swap readahead. Of course, this is about the sequential access pattern; the
> patch has no impact on the random access pattern, because the new cluster
> allocation algorithm is only used for SSD.
>
> Signed-off-by: Shaohua Li <shli@xxxxxxxxxxxx>
> ---

Acked-by: Rafael Aquini <aquini@xxxxxxxxxx>

>  include/linux/swap.h |    6 ++
>  mm/swapfile.c        |  110 +++++++++++++++++++++++++++++++++++++--------------
>  2 files changed, 87 insertions(+), 29 deletions(-)
>
> Index: linux/include/linux/swap.h
> ===================================================================
> --- linux.orig/include/linux/swap.h	2013-03-22 17:23:56.000000000 +0800
> +++ linux/include/linux/swap.h	2013-03-22 17:44:16.877775720 +0800
> @@ -175,6 +175,11 @@ enum {
>  #define COUNT_CONTINUED	0x80	/* See swap_map continuation for full count */
>  #define SWAP_MAP_SHMEM	0xbf	/* Owned by shmem/tmpfs, in first swap_map */
>
> +struct percpu_cluster {
> +	unsigned int index; /* Current cluster index */
> +	unsigned int next; /* Likely next allocation offset */
> +};
> +
>  /*
>   * The in-memory structure used to track swap areas.
>   */
> @@ -194,6 +199,7 @@ struct swap_info_struct {
>  	unsigned int inuse_pages;	/* number of those currently in use */
>  	unsigned int cluster_next;	/* likely index for next allocation */
>  	unsigned int cluster_nr;	/* countdown to next cluster search */
> +	struct percpu_cluster __percpu *percpu_cluster;
>  	struct swap_extent *curr_swap_extent;
>  	struct swap_extent first_swap_extent;
>  	struct block_device *bdev;	/* swap device or bdev of swap file */
> Index: linux/mm/swapfile.c
> ===================================================================
> --- linux.orig/mm/swapfile.c	2013-03-22 17:40:51.000000000 +0800
> +++ linux/mm/swapfile.c	2013-03-26 11:03:09.225915386 +0800
> @@ -353,13 +353,71 @@ static inline void dec_cluster_info_page
>   * It's possible scan_swap_map() uses a free cluster in the middle of free
>   * cluster list. Avoiding such abuse to avoid list corruption.
>   */
> -static inline bool scan_swap_map_recheck_cluster(struct swap_info_struct *si,
> +static bool
> +scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
>  	unsigned long offset)
>  {
> +	struct percpu_cluster *percpu_cluster;
> +	bool conflict;
> +
>  	offset /= SWAPFILE_CLUSTER;
> -	return si->free_cluster_head != CLUSTER_NULL &&
> +	conflict = si->free_cluster_head != CLUSTER_NULL &&
>  		offset != si->free_cluster_head &&
>  		cluster_is_free(si->cluster_info[offset]);
> +
> +	if (!conflict)
> +		return false;
> +
> +	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
> +	percpu_cluster->index = CLUSTER_NULL;
> +	return true;
> +}
> +
> +static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
> +	unsigned long *offset, unsigned long *scan_base)
> +{
> +	struct percpu_cluster *cluster;
> +	bool found_free;
> +	unsigned long tmp;
> +
> +new_cluster:
> +	cluster = this_cpu_ptr(si->percpu_cluster);
> +	if (cluster->index == CLUSTER_NULL) {
> +		if (si->free_cluster_head != CLUSTER_NULL) {
> +			cluster->index = si->free_cluster_head;
> +			cluster->next = cluster->index * SWAPFILE_CLUSTER;
> +		} else if (si->discard_cluster_head != CLUSTER_NULL) {
> +			/*
> +			 * we don't have free cluster but have some clusters in
> +			 * discarding, do discard now and reclaim them
> +			 */
> +			swap_do_scheduled_discard(si);
> +			goto new_cluster;
> +		} else
> +			return;
> +	}
> +
> +	found_free = false;
> +
> +	/*
> +	 * Other CPUs can use our cluster if they can't find a free cluster,
> +	 * check if there is still free entry in the cluster
> +	 */
> +	tmp = cluster->next;
> +	while (tmp < si->max && tmp < (cluster->index + 1) * SWAPFILE_CLUSTER) {
> +		if (!si->swap_map[tmp]) {
> +			found_free = true;
> +			break;
> +		}
> +		tmp++;
> +	}
> +	if (!found_free) {
> +		cluster->index = CLUSTER_NULL;
> +		goto new_cluster;
> +	}
> +	cluster->next = tmp + 1;
> +	*offset = tmp;
> +	*scan_base = tmp;
>  }
>
>  static unsigned long scan_swap_map(struct swap_info_struct *si,
> @@ -384,36 +442,17 @@ static unsigned long scan_swap_map(struc
>  	si->flags += SWP_SCANNING;
>  	scan_base = offset = si->cluster_next;
>
> +	/* SSD algorithm */
> +	if (si->cluster_info) {
> +		scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
> +		goto checks;
> +	}
> +
>  	if (unlikely(!si->cluster_nr--)) {
>  		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
>  			si->cluster_nr = SWAPFILE_CLUSTER - 1;
>  			goto checks;
>  		}
> -check_cluster:
> -		if (si->free_cluster_head != CLUSTER_NULL) {
> -			offset = si->free_cluster_head * SWAPFILE_CLUSTER;
> -			last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
> -			si->cluster_next = offset;
> -			si->cluster_nr = SWAPFILE_CLUSTER - 1;
> -			goto checks;
> -		} else if (si->cluster_info) {
> -			/*
> -			 * we don't have free cluster but have some clusters in
> -			 * discarding, do discard now and reclaim them
> -			 */
> -			if (si->discard_cluster_head != CLUSTER_NULL) {
> -				swap_do_scheduled_discard(si);
> -				goto check_cluster;
> -			}
> -
> -			/*
> -			 * Checking free cluster is fast enough, we can do the
> -			 * check every time
> -			 */
> -			si->cluster_nr = 0;
> -			goto checks;
> -		}
> -
>  		spin_unlock(&si->lock);
>
>  		/*
> @@ -471,8 +510,11 @@ check_cluster:
>  	}
>
>  checks:
> -	if (scan_swap_map_recheck_cluster(si, offset))
> -		goto check_cluster;
> +	if (si->cluster_info) {
> +		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
> +			scan_swap_map_try_ssd_cluster(si, &offset, &scan_base);
> +		}
> +	}
>  	if (!(si->flags & SWP_WRITEOK))
>  		goto no_page;
>  	if (!si->highest_bit)
> @@ -1813,6 +1855,8 @@ SYSCALL_DEFINE1(swapoff, const char __us
>  	spin_unlock(&p->lock);
>  	spin_unlock(&swap_lock);
>  	mutex_unlock(&swapon_mutex);
> +	free_percpu(p->percpu_cluster);
> +	p->percpu_cluster = NULL;
>  	vfree(swap_map);
>  	vfree(cluster_info);
>  	vfree(frontswap_map_get(p));
> @@ -2310,6 +2354,12 @@ SYSCALL_DEFINE2(swapon, const char __use
>  			error = -ENOMEM;
>  			goto bad_swap;
>  		}
> +		/* It's fine to initialize percpu_cluster to 0 */
> +		p->percpu_cluster = alloc_percpu(struct percpu_cluster);
> +		if (!p->percpu_cluster) {
> +			error = -ENOMEM;
> +			goto bad_swap;
> +		}
>  	}
>
>  	error = swap_cgroup_swapon(p->type, maxpages);
> @@ -2353,6 +2403,8 @@ SYSCALL_DEFINE2(swapon, const char __use
>  	error = 0;
>  	goto out;
>  bad_swap:
> +	free_percpu(p->percpu_cluster);
> +	p->percpu_cluster = NULL;
>  	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
>  		set_blocksize(p->bdev, p->old_block_size);
>  		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
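
For anyone who wants to see the allocation idea in isolation, below is a
minimal user-space sketch of the per-cpu cluster scheme the changelog
describes. It is not the kernel code: the names (pcp_cluster, alloc_slot,
next_free_cluster) are made up for illustration, locking is omitted, and the
fall-back to scanning swap_map is reduced to returning -1 when no cluster is
left.

/*
 * Minimal user-space model of per-cpu swap cluster allocation
 * (illustrative only, not the kernel implementation).
 */
#include <stdio.h>

#define NR_CPUS           2
#define SLOTS_PER_CLUSTER 8
#define NR_CLUSTERS       16
#define CLUSTER_NONE      (-1)

static unsigned char slot_map[NR_CLUSTERS * SLOTS_PER_CLUSTER]; /* 0 == free */
static int next_free_cluster;   /* toy stand-in for the free cluster list */

struct pcp_cluster {
	int cluster;    /* cluster currently owned by this CPU, or CLUSTER_NONE */
	int next;       /* likely next slot within that cluster */
};

static struct pcp_cluster pcp[NR_CPUS];

/* Hand out one swap slot for @cpu; returns the slot index, or -1 when full. */
static int alloc_slot(int cpu)
{
	struct pcp_cluster *c = &pcp[cpu];

	for (;;) {
		if (c->cluster == CLUSTER_NONE) {
			if (next_free_cluster >= NR_CLUSTERS)
				return -1;   /* would fall back to scanning swap_map */
			c->cluster = next_free_cluster++;
			c->next = c->cluster * SLOTS_PER_CLUSTER;
		}
		/* Scan our own cluster; another CPU may have used slots in it. */
		int end = (c->cluster + 1) * SLOTS_PER_CLUSTER;
		for (int slot = c->next; slot < end; slot++) {
			if (!slot_map[slot]) {
				slot_map[slot] = 1;
				c->next = slot + 1;
				return slot;
			}
		}
		c->cluster = CLUSTER_NONE;   /* cluster exhausted, take a new one */
	}
}

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		pcp[cpu].cluster = CLUSTER_NONE;

	/* Two "CPUs" allocating in lockstep still get sequential runs each. */
	for (int i = 0; i < 6; i++) {
		int a = alloc_slot(0);
		int b = alloc_slot(1);
		printf("cpu0 -> %2d   cpu1 -> %2d\n", a, b);
	}
	return 0;
}

Run as-is, the two "CPUs" each drain their own cluster (slots 0..7 and 8..15),
which is the sequential-per-task behaviour the patch is after; with a single
shared cluster their writes would interleave within the same slot range.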