swap cluster allocation is to get better request merge to improve performance. But the cluster is shared globally, if multiple tasks are doing swap, this will cause interleave disk access. While multiple tasks swap is quite common, for example, each numa node has a kswapd thread doing swap or multiple threads/processes do direct page reclaim. We makes the cluster allocation per-cpu here. The interleave disk access issue goes away. All tasks will do sequential swap. If one CPU can't get its per-cpu cluster, it will fallback to scan swap_map. The CPU can still continue swap. We don't need recycle free swap entries of other CPUs. In my test (swap to a 2-disk raid0 partition), this improves around 10% swapout throughput, and request size is increased significantly. How does this impact swap readahead is uncertain though. On one side, page reclaim always isolates and swaps several adjancent pages, this will make page reclaim write the pages sequentially and benefit readahead. On the other side, several CPU write pages interleave means the pages don't live _sequentially_ but relatively _near_. In the per-cpu allocation case, if adjancent pages are written by different cpus, they will live relatively _far_. So how this impacts swap readahead depends on how many pages page reclaim isolates and swaps one time. If the number is big, this patch will benefit swap readahead. Of course, this is about sequential access pattern. The patch has no impact for random access pattern, because the new cluster allocation algorithm is just for SSD. Signed-off-by: Shaohua Li <shli@xxxxxxxxxxxx> --- include/linux/swap.h | 6 ++ mm/swapfile.c | 110 +++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 87 insertions(+), 29 deletions(-) Index: linux/include/linux/swap.h =================================================================== --- linux.orig/include/linux/swap.h 2013-03-22 17:23:56.000000000 +0800 +++ linux/include/linux/swap.h 2013-03-22 17:44:16.877775720 +0800 @@ -175,6 +175,11 @@ enum { #define COUNT_CONTINUED 0x80 /* See swap_map continuation for full count */ #define SWAP_MAP_SHMEM 0xbf /* Owned by shmem/tmpfs, in first swap_map */ +struct percpu_cluster { + unsigned int index; /* Current cluster index */ + unsigned int next; /* Likely next allocation offset */ +}; + /* * The in-memory structure used to track swap areas. */ @@ -194,6 +199,7 @@ struct swap_info_struct { unsigned int inuse_pages; /* number of those currently in use */ unsigned int cluster_next; /* likely index for next allocation */ unsigned int cluster_nr; /* countdown to next cluster search */ + struct percpu_cluster __percpu *percpu_cluster; struct swap_extent *curr_swap_extent; struct swap_extent first_swap_extent; struct block_device *bdev; /* swap device or bdev of swap file */ Index: linux/mm/swapfile.c =================================================================== --- linux.orig/mm/swapfile.c 2013-03-22 17:40:51.000000000 +0800 +++ linux/mm/swapfile.c 2013-03-26 11:03:09.225915386 +0800 @@ -353,13 +353,71 @@ static inline void dec_cluster_info_page * It's possible scan_swap_map() uses a free cluster in the middle of free * cluster list. Avoiding such abuse to avoid list corruption. */ -static inline bool scan_swap_map_recheck_cluster(struct swap_info_struct *si, +static bool +scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si, unsigned long offset) { + struct percpu_cluster *percpu_cluster; + bool conflict; + offset /= SWAPFILE_CLUSTER; - return si->free_cluster_head != CLUSTER_NULL && + conflict = si->free_cluster_head != CLUSTER_NULL && offset != si->free_cluster_head && cluster_is_free(si->cluster_info[offset]); + + if (!conflict) + return false; + + percpu_cluster = this_cpu_ptr(si->percpu_cluster); + percpu_cluster->index = CLUSTER_NULL; + return true; +} + +static void scan_swap_map_try_ssd_cluster(struct swap_info_struct *si, + unsigned long *offset, unsigned long *scan_base) +{ + struct percpu_cluster *cluster; + bool found_free; + unsigned long tmp; + +new_cluster: + cluster = this_cpu_ptr(si->percpu_cluster); + if (cluster->index == CLUSTER_NULL) { + if (si->free_cluster_head != CLUSTER_NULL) { + cluster->index = si->free_cluster_head; + cluster->next = cluster->index * SWAPFILE_CLUSTER; + } else if (si->discard_cluster_head != CLUSTER_NULL) { + /* + * we don't have free cluster but have some clusters in + * discarding, do discard now and reclaim them + */ + swap_do_scheduled_discard(si); + goto new_cluster; + } else + return; + } + + found_free = false; + + /* + * Other CPUs can use our cluster if they can't find a free cluster, + * check if there is still free entry in the cluster + */ + tmp = cluster->next; + while (tmp < si->max && tmp < (cluster->index + 1) * SWAPFILE_CLUSTER) { + if (!si->swap_map[tmp]) { + found_free = true; + break; + } + tmp++; + } + if (!found_free) { + cluster->index = CLUSTER_NULL; + goto new_cluster; + } + cluster->next = tmp + 1; + *offset = tmp; + *scan_base = tmp; } static unsigned long scan_swap_map(struct swap_info_struct *si, @@ -384,36 +442,17 @@ static unsigned long scan_swap_map(struc si->flags += SWP_SCANNING; scan_base = offset = si->cluster_next; + /* SSD algorithm */ + if (si->cluster_info) { + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + goto checks; + } + if (unlikely(!si->cluster_nr--)) { if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) { si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } -check_cluster: - if (si->free_cluster_head != CLUSTER_NULL) { - offset = si->free_cluster_head * SWAPFILE_CLUSTER; - last_in_cluster = offset + SWAPFILE_CLUSTER - 1; - si->cluster_next = offset; - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } else if (si->cluster_info) { - /* - * we don't have free cluster but have some clusters in - * discarding, do discard now and reclaim them - */ - if (si->discard_cluster_head != CLUSTER_NULL) { - swap_do_scheduled_discard(si); - goto check_cluster; - } - - /* - * Checking free cluster is fast enough, we can do the - * check every time - */ - si->cluster_nr = 0; - goto checks; - } - spin_unlock(&si->lock); /* @@ -471,8 +510,11 @@ check_cluster: } checks: - if (scan_swap_map_recheck_cluster(si, offset)) - goto check_cluster; + if (si->cluster_info) { + while (scan_swap_map_ssd_cluster_conflict(si, offset)) { + scan_swap_map_try_ssd_cluster(si, &offset, &scan_base); + } + } if (!(si->flags & SWP_WRITEOK)) goto no_page; if (!si->highest_bit) @@ -1813,6 +1855,8 @@ SYSCALL_DEFINE1(swapoff, const char __us spin_unlock(&p->lock); spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; vfree(swap_map); vfree(cluster_info); vfree(frontswap_map_get(p)); @@ -2310,6 +2354,12 @@ SYSCALL_DEFINE2(swapon, const char __use error = -ENOMEM; goto bad_swap; } + /* It's fine to initialize percpu_cluster to 0 */ + p->percpu_cluster = alloc_percpu(struct percpu_cluster); + if (!p->percpu_cluster) { + error = -ENOMEM; + goto bad_swap; + } } error = swap_cgroup_swapon(p->type, maxpages); @@ -2353,6 +2403,8 @@ SYSCALL_DEFINE2(swapon, const char __use error = 0; goto out; bad_swap: + free_percpu(p->percpu_cluster); + p->percpu_cluster = NULL; if (inode && S_ISBLK(inode->i_mode) && p->bdev) { set_blocksize(p->bdev, p->old_block_size); blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>