Migration Cache 3/5 - move page from migration to swap cache This patch modifies the swapfile.c "unuse_*" stack to support moving pages from migration cache to swap cache in case we have to "fall back to swap". This also allows vmscan.c:shrink_page_list() to move migration cache pages to swap cache when/if it wants to swap them out. shrink_page_list() should only find anon pages in the migration cache when do_mbind() is called with MPOL_MF_MOVE + '_MF_LAZY or when lazy automigration is enabled. Because of the new usage, the patch renames the static "unuse_*" functions in swapfile.c to "update_*". In "update_pte_range", if the entry arg matches the page's private data, we perform the usual "unuse_pte()"; otherwise, this is an "update/move" operation and we "update_pte()". Then, this patch implements the __migration_move_to_swap() function on top of the modified "update_*" stack. Assumption: because this facility is used only for removing swap devices [sys_swapoff()] and swapping out migration cached pages, it is not in a critical/fast path. Signed-off-by: Lee Schermerhorn <lee.schermerhorn@xxxxxx> include/linux/rmap.h | 4 ++ include/linux/swap.h | 7 +++- mm/rmap.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++ mm/swapfile.c | 76 +++++++++++++++++++++++++++++++++++++++++++------ 4 files changed, 154 insertions(+), 11 deletions(-) Index: linux-2.6.36-mmotm-101103-1217/include/linux/swap.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/swap.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/swap.h @@ -8,6 +8,7 @@ #include <linux/memcontrol.h> #include <linux/sched.h> #include <linux/node.h> +#include <linux/rmap.h> #include <asm/atomic.h> #include <asm/page.h> @@ -405,6 +406,8 @@ extern int migration_add_reference_page extern int migration_ref_count(swp_entry_t); extern void __migration_remove_reference(struct page *, swp_entry_t); extern struct page *lookup_migration_cache(swp_entry_t); +extern int __migration_move_to_swap(struct vm_area_struct *, struct page *, + swp_entry_t); #ifdef PAGE_FLAGS_PRIVATE /* only where this is defined */ /** @@ -412,7 +415,7 @@ extern struct page *lookup_migration_cac * @page: page to check/add * * For vmscan:shrink_page_list(): - * if @page in migration cache, fail -- until "move to swap" available. + * if @page in migration cache, try to move to swap cache. * if @page already in swap cache -- OK to swap out. * else try to add @page to swap cache */ @@ -420,7 +423,7 @@ static inline int check_add_to_swap(stru { if (PageSwapCache(page)) { if (page_in_migration_cache(page)) - return 0; /* Fail -- TODO: move to swap */ + return !migration_move_to_swap(page); else return 1; /* already in swap cache */ } Index: linux-2.6.36-mmotm-101103-1217/mm/swapfile.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/swapfile.c +++ linux-2.6.36-mmotm-101103-1217/mm/swapfile.c @@ -896,6 +896,41 @@ unsigned int count_swap_pages(int type, #endif /* CONFIG_HIBERNATION */ /* + * replace [migration cache] pte in pmd @ addr with swap pte built from + * swp_entry_t value in page's private data. 
Free [decrement ref count] + * previous [migration cache] entry + */ +static int update_pte(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, swp_entry_t entry, struct page *page) +{ +#ifdef CONFIG_MIGRATION + spinlock_t *ptl; + pte_t *pte; + int ret = 0; + + BUG_ON(!is_migration_cache(entry)); + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + if (likely(pte_same(*pte, swp_entry_to_pte(entry)))) { + swp_entry_t new_entry; + pte_t new_pte; + + new_entry.val = page_private(page); + new_pte = swp_entry_to_pte(new_entry); + set_pte_at(vma->vm_mm, addr, pte, new_pte); + + __migration_remove_reference(NULL, entry); + ret = 1; /* updated -- terminate outer loops */ + } + pte_unmap_unlock(pte, ptl); + + return ret; +#else + BUG(); /* shouldn't get here */ +#endif +} + +/* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to * force COW, vm_page_prot omits write permission from any private vma. @@ -940,7 +975,14 @@ out_nolock: return ret; } -static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, +/* + * @entry contains pte to replace in *pmd + * if @entry == page_private(page), "unuse" the swap pte--i.e., + * replace it with a real anon page pte + * else replace the pte with the swap entry in page_private(@page) + * [for moving migration cache pages to swap cache] + */ +static int update_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, swp_entry_t entry, struct page *page) { @@ -965,7 +1007,10 @@ static int unuse_pte_range(struct vm_are */ if (unlikely(pte_same(*pte, swp_pte))) { pte_unmap(pte); - ret = unuse_pte(vma, pmd, addr, entry, page); + if (entry.val == page_private(page)) + ret = unuse_pte(vma, pmd, addr, entry, page); + else + ret = update_pte(vma, pmd, addr, entry, page); if (ret) goto out; pte = pte_offset_map(pmd, addr); @@ -976,7 +1021,7 @@ out: return ret; } -static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, +static inline int update_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, swp_entry_t entry, struct page *page) { @@ -989,14 +1034,14 @@ static inline int unuse_pmd_range(struct next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - ret = unuse_pte_range(vma, pmd, addr, next, entry, page); + ret = update_pte_range(vma, pmd, addr, next, entry, page); if (ret) return ret; } while (pmd++, addr = next, addr != end); return 0; } -static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd, +static inline int update_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, swp_entry_t entry, struct page *page) { @@ -1009,14 +1054,14 @@ static inline int unuse_pud_range(struct next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - ret = unuse_pmd_range(vma, pud, addr, next, entry, page); + ret = update_pmd_range(vma, pud, addr, next, entry, page); if (ret) return ret; } while (pud++, addr = next, addr != end); return 0; } -static int unuse_vma(struct vm_area_struct *vma, +static int update_vma(struct vm_area_struct *vma, swp_entry_t entry, struct page *page) { pgd_t *pgd; @@ -1039,7 +1084,7 @@ static int unuse_vma(struct vm_area_stru next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - ret = unuse_pud_range(vma, pgd, addr, next, entry, page); + ret = update_pud_range(vma, pgd, addr, next, entry, page); if (ret) return ret; } while (pgd++, addr = next, 
addr != end); @@ -1063,13 +1108,26 @@ static int unuse_mm(struct mm_struct *mm lock_page(page); } for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) + if (vma->anon_vma && (ret = update_vma(vma, entry, page))) break; } up_read(&mm->mmap_sem); return (ret < 0)? ret: 0; } +#ifdef CONFIG_MIGRATION_CACHE +/* + * replace migration cache pte for page with swap pte built + * from page_private(page). + */ +int __migration_move_to_swap(struct vm_area_struct *vma, + struct page *page, swp_entry_t entry) +{ + return update_vma(vma, entry, page); + +} +#endif + /* * Scan swap_map from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. Index: linux-2.6.36-mmotm-101103-1217/include/linux/rmap.h =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/include/linux/rmap.h +++ linux-2.6.36-mmotm-101103-1217/include/linux/rmap.h @@ -204,6 +204,10 @@ int try_to_unmap(struct page *, enum ttu int try_to_unmap_one(struct page *, struct vm_area_struct *, unsigned long address, enum ttu_flags flags); +#ifdef CONFIG_MIGRATION_CACHE +int migration_move_to_swap(struct page *); +#endif + /* * Called from mm/filemap_xip.c to unmap empty zero page */ Index: linux-2.6.36-mmotm-101103-1217/mm/rmap.c =================================================================== --- linux-2.6.36-mmotm-101103-1217.orig/mm/rmap.c +++ linux-2.6.36-mmotm-101103-1217/mm/rmap.c @@ -355,6 +355,84 @@ void page_unlock_anon_vma(struct anon_vm rcu_read_unlock(); } +#ifdef CONFIG_MIGRATION_CACHE +/* + * Move a page in the migration cache to the swap cache when + * vmscan finds anon page swap candidate in migration cache. + * Return !0 on success; 0 otherwise + * + * Must hold page lock. + */ +int migration_move_to_swap(struct page *page) +{ + struct anon_vma *anon_vma; + struct anon_vma_chain *avc; + swp_entry_t entry; + int moved = 0; + int ret = 0; + + BUG_ON(!PageLocked(page)); + BUG_ON(!page_in_migration_cache(page)); + + /* + * Optimistically add page to swap cache + */ + entry.val = page_private(page); /* save for move */ + set_page_private(page, 0); /* prepare for __add_to_swap() */ + ClearPageSwapCache(page); + if (!__add_to_swap(page, 0)) + goto out; + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) { + delete_from_swap_cache(page); /* back out */ + goto out; /* nothing to move */ + } + + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + if (!__migration_move_to_swap(vma, page, entry)) { + page_unlock_anon_vma(anon_vma); + /* + * If we've moved any pages, we're left with page + * partially in migration cache, partially in swap + * cache. Can't be good! + */ + if (moved) { + printk (KERN_ERR + "%s failed after moving %d entries\n", + __FUNCTION__, moved); + BUG(); + } + goto out; + } + moved++; + } + + page_unlock_anon_vma(anon_vma); + + /* + * __add_to_swap() added another ref to page for swap cache. + * __migration_move_to_swap() did NOT remove the migration + * cache's ref on the page, so drop it here, after replacing + * all migration ptes. + */ + page_cache_release(page); + ret = 1; + +out: + if (!ret) { + /* + * restore migration cache entry on error. + */ + set_page_private(page, entry.val); + SetPageSwapCache(page); + } + return ret; +} +#endif /* _MIGRATION_CACHE */ + + /* * At what user virtual address is page expected in @vma? 
 * Returns virtual address or -EFAULT if page's index/offset is not
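
For illustration only (not part of the patch): a minimal sketch of how a
shrink_page_list()-style caller could consume check_add_to_swap() once
migration cache pages can fall back to swap.  The wrapper name
prepare_anon_for_pageout() is hypothetical; only check_add_to_swap() and the
page-flag tests come from the patched tree, and the return convention follows
check_add_to_swap()'s own comment (nonzero == OK to swap the page out).

#include <linux/mm.h>
#include <linux/swap.h>

/*
 * Hypothetical caller sketch.  @page must be locked, as for
 * migration_move_to_swap().
 *
 * check_add_to_swap() (see the swap.h hunk above):
 *   - page already in swap cache  -> OK to swap out
 *   - page in migration cache     -> migration_move_to_swap(), which
 *     rewrites the migration ptes as swap ptes and moves the page to
 *     the swap cache
 *   - otherwise                   -> try to add @page to swap cache
 */
static int prepare_anon_for_pageout(struct page *page)
{
	VM_BUG_ON(!PageLocked(page));

	if (!PageAnon(page))
		return 1;	/* file-backed: nothing to add to swap */

	return check_add_to_swap(page);
}

A caller would treat a zero return the way shrink_page_list() treats a failed
add_to_swap() today: keep (or re-activate) the page rather than reclaim it.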