From: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx> Subject: [PATCH V9 2/6] mm: frontswap: core swap subsystem hooks and headers (Note to earlier reviewers: This patchset has been reorganized due to feedback from Kame Hiroyuki and Andrew Morton. This patch combines patches 1of4 and 3of4 from the previous series.) This second patch of six in the frontswap series contains the changes to the core swap subsystem. It: (1) makes available the core swap data structures (swap_lock, swap_list and swap_info) that are needed by frontswap.c. Because we don't want to expose them to the dozens of files that include swap.h, we create a new swapfile.h just to extern-ify them, and change their definitions to non-static. (2) adds frontswap-related elements to swap_info_struct: frontswap_map points to vzalloc'ed one-bit-per-swap-page metadata that indicates whether each swap page is in frontswap or on the device, and frontswap_pages counts how many pages are in frontswap. (3) adds hooks in the swap subsystem and extends try_to_unuse so that frontswap_shrink can do a "partial swapoff". Note that a failed frontswap_map allocation is safe; the failure is indicated by the absence of "FS" in the subsequent printk.
[v9: akpm@xxxxxxxxxxxxxxxxxxxx: mark some statics __read_mostly] [v9: akpm@xxxxxxxxxxxxxxxxxxxx: add clarifying comments] [v9: akpm@xxxxxxxxxxxxxxxxxxxx: no need to loop repeating try_to_unuse] [v9: error27@xxxxxxxxx: remove superfluous check for NULL] [v8: rebase to 3.0-rc4] [v8: kamezawa.hiroyu@xxxxxxxxxxxxxx: change counter to atomic_t to avoid races] [v8: kamezawa.hiroyu@xxxxxxxxxxxxxx: comment to clarify informational counters] [v7: rebase to 3.0-rc3] [v7: JBeulich@xxxxxxxxxx: add new swap struct elements only if config'd] [v6: rebase to 3.0-rc1] [v6: lliubbo@xxxxxxxxx: fix null pointer deref if vzalloc fails] [v6: konrad.wilk@xxxxxxxxx: various checks and code clarifications/comments] [v5: no change from v4] [v4: rebase to 2.6.39] Signed-off-by: Dan Magenheimer <dan.magenheimer@xxxxxxxxxx> Reviewed-by: Konrad Wilk <konrad.wilk@xxxxxxxxxx> Reviewed-by: Kamezawa Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> Acked-by: Jan Beulich <JBeulich@xxxxxxxxxx> Acked-by: Seth Jennings <sjenning@xxxxxxxxxxxxxxxxxx> Cc: Jeremy Fitzhardinge <jeremy@xxxxxxxx> Cc: Hugh Dickins <hughd@xxxxxxxxxx> Cc: Johannes Weiner <hannes@xxxxxxxxxxx> Cc: Nitin Gupta <ngupta@xxxxxxxxxx> Cc: Matthew Wilcox <matthew@xxxxxx> Cc: Chris Mason <chris.mason@xxxxxxxxxx> Cc: Rik Riel <riel@xxxxxxxxxx> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- Diffstat: include/linux/swap.h | 4 + include/linux/swapfile.h | 13 ++++ mm/page_io.c | 12 +++ mm/swapfile.c | 64 ++++++++++++++++----- 4 files changed, 80 insertions(+), 13 deletions(-) --- linux/include/linux/swapfile.h 1969-12-31 17:00:00.000000000 -0700 +++ frontswap-v9/include/linux/swapfile.h 2011-09-12 10:29:08.046699427 -0600 @@ -0,0 +1,13 @@ +#ifndef _LINUX_SWAPFILE_H +#define _LINUX_SWAPFILE_H + +/* + * these were static in swapfile.c but frontswap.c needs them and we don't + * want to expose them to the dozens of source files that include swap.h + */ +extern spinlock_t swap_lock; +extern struct swap_list_t swap_list; +extern struct swap_info_struct 
*swap_info[]; +extern int try_to_unuse(unsigned int, bool, unsigned long); + +#endif /* _LINUX_SWAPFILE_H */ --- linux/include/linux/swap.h 2011-08-08 08:19:25.880690134 -0600 +++ frontswap-v9/include/linux/swap.h 2011-09-12 10:29:08.047687058 -0600 @@ -194,6 +194,10 @@ struct swap_info_struct { struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ +#ifdef CONFIG_FRONTSWAP + unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ + atomic_t frontswap_pages; /* frontswap pages in-use counter */ +#endif }; struct swap_list_t { --- linux/mm/swapfile.c 2011-08-08 08:19:26.336684746 -0600 +++ frontswap-v9/mm/swapfile.c 2011-09-12 10:44:20.352686551 -0600 @@ -32,6 +32,8 @@ #include <linux/memcontrol.h> #include <linux/poll.h> #include <linux/oom.h> +#include <linux/frontswap.h> +#include <linux/swapfile.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> @@ -43,7 +45,7 @@ static bool swap_count_continued(struct static void free_swap_count_continuations(struct swap_info_struct *); static sector_t map_swap_entry(swp_entry_t, struct block_device**); -static DEFINE_SPINLOCK(swap_lock); +DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; long nr_swap_pages; long total_swap_pages; @@ -54,9 +56,9 @@ static const char Unused_file[] = "Unuse static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -static struct swap_list_t swap_list = {-1, -1}; +struct swap_list_t swap_list = {-1, -1}; -static struct swap_info_struct *swap_info[MAX_SWAPFILES]; +struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -557,6 +559,7 @@ static unsigned char swap_entry_free(str swap_list.next = p->type; nr_swap_pages++; p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); if ((p->flags & SWP_BLKDEV) && disk->fops->swap_slot_free_notify) 
disk->fops->swap_slot_free_notify(p->bdev, offset); @@ -1018,11 +1021,12 @@ static int unuse_mm(struct mm_struct *mm } /* - * Scan swap_map from current position to next entry still in use. + * Scan swap_map (or frontswap_map if frontswap parameter is true) + * from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, - unsigned int prev) + unsigned int prev, bool frontswap) { unsigned int max = si->max; unsigned int i = prev; @@ -1048,6 +1052,12 @@ static unsigned int find_next_to_unuse(s prev = 0; i = 1; } + if (frontswap) { + if (frontswap_test(si, i)) + break; + else + continue; + } count = si->swap_map[i]; if (count && swap_count(count) != SWAP_MAP_BAD) break; @@ -1059,8 +1069,12 @@ static unsigned int find_next_to_unuse(s * We completely avoid races by reading each swap page in advance, * and then search for the process using it. All the necessary * page table adjustments can then be made atomically. + * + * if the boolean frontswap is true, only unuse pages_to_unuse pages; + * pages_to_unuse==0 means all pages; ignored if frontswap is false */ -static int try_to_unuse(unsigned int type) +int try_to_unuse(unsigned int type, bool frontswap, + unsigned long pages_to_unuse) { struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; @@ -1093,7 +1107,7 @@ static int try_to_unuse(unsigned int typ * one pass through swap_map is enough, but not necessarily: * there are races when an instance of an entry might be missed. */ - while ((i = find_next_to_unuse(si, i)) != 0) { + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { if (signal_pending(current)) { retval = -EINTR; break; @@ -1260,6 +1274,10 @@ static int try_to_unuse(unsigned int typ * interactive performance. 
*/ cond_resched(); + if (frontswap && pages_to_unuse > 0) { + if (!--pages_to_unuse) + break; + } } mmput(start_mm); @@ -1519,7 +1537,8 @@ bad_bmap: } static void enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map) + unsigned char *swap_map, + unsigned long *frontswap_map) { int i, prev; @@ -1529,6 +1548,7 @@ static void enable_swap_info(struct swap else p->prio = --least_priority; p->swap_map = swap_map; + frontswap_map_set(p, frontswap_map); p->flags |= SWP_WRITEOK; nr_swap_pages += p->pages; total_swap_pages += p->pages; @@ -1545,6 +1565,7 @@ static void enable_swap_info(struct swap swap_list.head = swap_list.next = p->type; else swap_info[prev]->next = p->type; + frontswap_init(p->type); spin_unlock(&swap_lock); } @@ -1616,7 +1637,7 @@ SYSCALL_DEFINE1(swapoff, const char __us spin_unlock(&swap_lock); oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); - err = try_to_unuse(type); + err = try_to_unuse(type, false, 0); /* force all pages to be unused */ test_set_oom_score_adj(oom_score_adj); if (err) { @@ -1627,7 +1648,7 @@ SYSCALL_DEFINE1(swapoff, const char __us * sys_swapoff for this swap_info_struct at this point. 
*/ /* re-insert swap space back into swap_list */ - enable_swap_info(p, p->prio, p->swap_map); + enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); goto out_dput; } @@ -1653,9 +1674,11 @@ SYSCALL_DEFINE1(swapoff, const char __us swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; + frontswap_invalidate_area(type); spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); vfree(swap_map); + vfree(frontswap_map_get(p)); /* Destroy swap account informatin */ swap_cgroup_swapoff(type); @@ -2019,6 +2042,7 @@ SYSCALL_DEFINE2(swapon, const char __use sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; + unsigned long *frontswap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; @@ -2099,6 +2123,9 @@ SYSCALL_DEFINE2(swapon, const char __use error = nr_extents; goto bad_swap; } + /* frontswap enabled? set up bit-per-page map for frontswap */ + if (frontswap_enabled) + frontswap_map = vzalloc(maxpages / sizeof(long)); if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { @@ -2114,14 +2141,15 @@ SYSCALL_DEFINE2(swapon, const char __use if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - enable_swap_info(p, prio, swap_map); + enable_swap_info(p, prio, swap_map, frontswap_map); printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s\n", + "Priority:%d extents:%d across:%lluk %s%s%s\n", p->pages<<(PAGE_SHIFT-10), name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", - (p->flags & SWP_DISCARDABLE) ? "D" : ""); + (p->flags & SWP_DISCARDABLE) ? "D" : "", + (frontswap_map) ? 
"FS" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); @@ -2312,6 +2340,10 @@ int valid_swaphandles(swp_entry_t entry, base++; spin_lock(&swap_lock); + if (frontswap_test(si, target)) { + spin_unlock(&swap_lock); + return 0; + } if (end > si->max) /* don't go beyond end of map */ end = si->max; @@ -2322,6 +2354,9 @@ int valid_swaphandles(swp_entry_t entry, break; if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; + /* Don't read in frontswap pages */ + if (frontswap_test(si, toff)) + break; } /* Count contiguous allocated slots below our target */ for (toff = target; --toff >= base; nr_pages++) { @@ -2330,6 +2365,9 @@ int valid_swaphandles(swp_entry_t entry, break; if (swap_count(si->swap_map[toff]) == SWAP_MAP_BAD) break; + /* Don't read in frontswap pages */ + if (frontswap_test(si, toff)) + break; } spin_unlock(&swap_lock); --- linux/mm/page_io.c 2011-07-20 14:50:42.395999221 -0600 +++ frontswap-v9/mm/page_io.c 2011-09-12 10:29:08.081690546 -0600 @@ -18,6 +18,7 @@ #include <linux/bio.h> #include <linux/swapops.h> #include <linux/writeback.h> +#include <linux/frontswap.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -98,6 +99,12 @@ int swap_writepage(struct page *page, st unlock_page(page); goto out; } + if (frontswap_put_page(page) == 0) { + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + goto out; + } bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); @@ -122,6 +129,11 @@ int swap_readpage(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); + if (frontswap_get_page(page) == 0) { + SetPageUptodate(page); + unlock_page(page); + goto out; + } bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . 
Fight unfair telecom internet charges in Canada: sign http://stopthemeter.ca/ Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>