Move out the code for unusing swap entries from loop in try_to_unuse() to separate function: try_to_unuse_swp_entry(). Export this new function in swapfile.h just like try_to_unuse() is exported. This new function will be used for unusing swap entries from subsystems (e.g. zswap). Signed-off-by: Krzysztof Kozlowski <k.kozlowski@xxxxxxxxxxx> --- include/linux/swapfile.h | 2 + mm/swapfile.c | 356 ++++++++++++++++++++++++---------------------- 2 files changed, 189 insertions(+), 169 deletions(-) diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index e282624..68c24a7 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -9,5 +9,7 @@ extern spinlock_t swap_lock; extern struct swap_list_t swap_list; extern struct swap_info_struct *swap_info[]; extern int try_to_unuse(unsigned int, bool, unsigned long); +extern int try_to_unuse_swp_entry(struct mm_struct **start_mm, + struct swap_info_struct *si, swp_entry_t entry); #endif /* _LINUX_SWAPFILE_H */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 36af6ee..4ba21ec 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1100,6 +1100,190 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, } /* + * Returns: + * - negative on error, + * - 0 on success (entry unused, freed independently or shmem entry + * already released) + */ +int try_to_unuse_swp_entry(struct mm_struct **start_mm, + struct swap_info_struct *si, swp_entry_t entry) +{ + pgoff_t offset = swp_offset(entry); + unsigned char *swap_map; + unsigned char swcount; + struct page *page; + int retval = 0; + + if (signal_pending(current)) { + retval = -EINTR; + goto out; + } + + /* + * Get a page for the entry, using the existing swap + * cache page if there is one. Otherwise, get a clean + * page and read the swap into it. + */ + swap_map = &si->swap_map[offset]; + page = read_swap_cache_async(entry, + GFP_HIGHUSER_MOVABLE, NULL, 0); + if (!page) { + /* + * Either swap_duplicate() failed because entry + * has been freed independently, and will not be + * reused since sys_swapoff() already disabled + * allocation from here, or alloc_page() failed. + */ + if (!*swap_map) + retval = 0; + else + retval = -ENOMEM; + goto out; + } + + /* + * Don't hold on to start_mm if it looks like exiting. + */ + if (atomic_read(&(*start_mm)->mm_users) == 1) { + mmput(*start_mm); + *start_mm = &init_mm; + atomic_inc(&init_mm.mm_users); + } + + /* + * Wait for and lock page. When do_swap_page races with + * try_to_unuse, do_swap_page can handle the fault much + * faster than try_to_unuse can locate the entry. This + * apparently redundant "wait_on_page_locked" lets try_to_unuse + * defer to do_swap_page in such a case - in some tests, + * do_swap_page and try_to_unuse repeatedly compete. + */ + wait_on_page_locked(page); + wait_on_page_writeback(page); + lock_page(page); + wait_on_page_writeback(page); + + /* + * Remove all references to entry. + */ + swcount = *swap_map; + if (swap_count(swcount) == SWAP_MAP_SHMEM) { + retval = shmem_unuse(entry, page); + VM_BUG_ON(retval > 0); + /* page has already been unlocked and released */ + goto out; + } + if (swap_count(swcount) && *start_mm != &init_mm) + retval = unuse_mm(*start_mm, entry, page); + + if (swap_count(*swap_map)) { + int set_start_mm = (*swap_map >= swcount); + struct list_head *p = &(*start_mm)->mmlist; + struct mm_struct *new_start_mm = *start_mm; + struct mm_struct *prev_mm = *start_mm; + struct mm_struct *mm; + + atomic_inc(&new_start_mm->mm_users); + atomic_inc(&prev_mm->mm_users); + spin_lock(&mmlist_lock); + while (swap_count(*swap_map) && !retval && + (p = p->next) != &(*start_mm)->mmlist) { + mm = list_entry(p, struct mm_struct, mmlist); + if (!atomic_inc_not_zero(&mm->mm_users)) + continue; + spin_unlock(&mmlist_lock); + mmput(prev_mm); + prev_mm = mm; + + cond_resched(); + + swcount = *swap_map; + if (!swap_count(swcount)) /* any usage ? */ + ; + else if (mm == &init_mm) + set_start_mm = 1; + else + retval = unuse_mm(mm, entry, page); + + if (set_start_mm && *swap_map < swcount) { + mmput(new_start_mm); + atomic_inc(&mm->mm_users); + new_start_mm = mm; + set_start_mm = 0; + } + spin_lock(&mmlist_lock); + } + spin_unlock(&mmlist_lock); + mmput(prev_mm); + mmput(*start_mm); + *start_mm = new_start_mm; + } + if (retval) { + unlock_page(page); + page_cache_release(page); + goto out; + } + + /* + * If a reference remains (rare), we would like to leave + * the page in the swap cache; but try_to_unmap could + * then re-duplicate the entry once we drop page lock, + * so we might loop indefinitely; also, that page could + * not be swapped out to other storage meanwhile. So: + * delete from cache even if there's another reference, + * after ensuring that the data has been saved to disk - + * since if the reference remains (rarer), it will be + * read from disk into another page. Splitting into two + * pages would be incorrect if swap supported "shared + * private" pages, but they are handled by tmpfs files. + * + * Given how unuse_vma() targets one particular offset + * in an anon_vma, once the anon_vma has been determined, + * this splitting happens to be just what is needed to + * handle where KSM pages have been swapped out: re-reading + * is unnecessarily slow, but we can fix that later on. + */ + if (swap_count(*swap_map) && + PageDirty(page) && PageSwapCache(page)) { + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; + + swap_writepage(page, &wbc); + lock_page(page); + wait_on_page_writeback(page); + } + + /* + * It is conceivable that a racing task removed this page from + * swap cache just before we acquired the page lock at the top, + * or while we dropped it in unuse_mm(). The page might even + * be back in swap cache on another swap area: that we must not + * delete, since it may not have been written out to swap yet. + */ + if (PageSwapCache(page) && + likely(page_private(page) == entry.val)) + delete_from_swap_cache(page); + + /* + * So we could skip searching mms once swap count went + * to 1, we did not mark any present ptes as dirty: must + * mark page dirty so shrink_page_list will preserve it. + */ + SetPageDirty(page); + unlock_page(page); + page_cache_release(page); + + /* + * Make sure that we aren't completely killing + * interactive performance. + */ + cond_resched(); +out: + return retval; +} + +/* * We completely avoid races by reading each swap page in advance, * and then search for the process using it. All the necessary * page table adjustments can then be made atomically. @@ -1112,10 +1296,6 @@ int try_to_unuse(unsigned int type, bool frontswap, { struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; - unsigned char *swap_map; - unsigned char swcount; - struct page *page; - swp_entry_t entry; unsigned int i = 0; int retval = 0; @@ -1142,172 +1322,10 @@ int try_to_unuse(unsigned int type, bool frontswap, * there are races when an instance of an entry might be missed. */ while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { - if (signal_pending(current)) { - retval = -EINTR; - break; - } - - /* - * Get a page for the entry, using the existing swap - * cache page if there is one. Otherwise, get a clean - * page and read the swap into it. - */ - swap_map = &si->swap_map[i]; - entry = swp_entry(type, i); - page = read_swap_cache_async(entry, - GFP_HIGHUSER_MOVABLE, NULL, 0); - if (!page) { - /* - * Either swap_duplicate() failed because entry - * has been freed independently, and will not be - * reused since sys_swapoff() already disabled - * allocation from here, or alloc_page() failed. - */ - if (!*swap_map) - continue; - retval = -ENOMEM; - break; - } - - /* - * Don't hold on to start_mm if it looks like exiting. - */ - if (atomic_read(&start_mm->mm_users) == 1) { - mmput(start_mm); - start_mm = &init_mm; - atomic_inc(&init_mm.mm_users); - } - - /* - * Wait for and lock page. When do_swap_page races with - * try_to_unuse, do_swap_page can handle the fault much - * faster than try_to_unuse can locate the entry. This - * apparently redundant "wait_on_page_locked" lets try_to_unuse - * defer to do_swap_page in such a case - in some tests, - * do_swap_page and try_to_unuse repeatedly compete. - */ - wait_on_page_locked(page); - wait_on_page_writeback(page); - lock_page(page); - wait_on_page_writeback(page); - - /* - * Remove all references to entry. - */ - swcount = *swap_map; - if (swap_count(swcount) == SWAP_MAP_SHMEM) { - retval = shmem_unuse(entry, page); - /* page has already been unlocked and released */ - if (retval < 0) - break; - continue; - } - if (swap_count(swcount) && start_mm != &init_mm) - retval = unuse_mm(start_mm, entry, page); - - if (swap_count(*swap_map)) { - int set_start_mm = (*swap_map >= swcount); - struct list_head *p = &start_mm->mmlist; - struct mm_struct *new_start_mm = start_mm; - struct mm_struct *prev_mm = start_mm; - struct mm_struct *mm; - - atomic_inc(&new_start_mm->mm_users); - atomic_inc(&prev_mm->mm_users); - spin_lock(&mmlist_lock); - while (swap_count(*swap_map) && !retval && - (p = p->next) != &start_mm->mmlist) { - mm = list_entry(p, struct mm_struct, mmlist); - if (!atomic_inc_not_zero(&mm->mm_users)) - continue; - spin_unlock(&mmlist_lock); - mmput(prev_mm); - prev_mm = mm; - - cond_resched(); - - swcount = *swap_map; - if (!swap_count(swcount)) /* any usage ? */ - ; - else if (mm == &init_mm) - set_start_mm = 1; - else - retval = unuse_mm(mm, entry, page); - - if (set_start_mm && *swap_map < swcount) { - mmput(new_start_mm); - atomic_inc(&mm->mm_users); - new_start_mm = mm; - set_start_mm = 0; - } - spin_lock(&mmlist_lock); - } - spin_unlock(&mmlist_lock); - mmput(prev_mm); - mmput(start_mm); - start_mm = new_start_mm; - } - if (retval) { - unlock_page(page); - page_cache_release(page); + retval = try_to_unuse_swp_entry(&start_mm, si, + swp_entry(type, i)); + if (retval != 0) break; - } - - /* - * If a reference remains (rare), we would like to leave - * the page in the swap cache; but try_to_unmap could - * then re-duplicate the entry once we drop page lock, - * so we might loop indefinitely; also, that page could - * not be swapped out to other storage meanwhile. So: - * delete from cache even if there's another reference, - * after ensuring that the data has been saved to disk - - * since if the reference remains (rarer), it will be - * read from disk into another page. Splitting into two - * pages would be incorrect if swap supported "shared - * private" pages, but they are handled by tmpfs files. - * - * Given how unuse_vma() targets one particular offset - * in an anon_vma, once the anon_vma has been determined, - * this splitting happens to be just what is needed to - * handle where KSM pages have been swapped out: re-reading - * is unnecessarily slow, but we can fix that later on. - */ - if (swap_count(*swap_map) && - PageDirty(page) && PageSwapCache(page)) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - }; - - swap_writepage(page, &wbc); - lock_page(page); - wait_on_page_writeback(page); - } - - /* - * It is conceivable that a racing task removed this page from - * swap cache just before we acquired the page lock at the top, - * or while we dropped it in unuse_mm(). The page might even - * be back in swap cache on another swap area: that we must not - * delete, since it may not have been written out to swap yet. - */ - if (PageSwapCache(page) && - likely(page_private(page) == entry.val)) - delete_from_swap_cache(page); - - /* - * So we could skip searching mms once swap count went - * to 1, we did not mark any present ptes as dirty: must - * mark page dirty so shrink_page_list will preserve it. - */ - SetPageDirty(page); - unlock_page(page); - page_cache_release(page); - - /* - * Make sure that we aren't completely killing - * interactive performance. - */ - cond_resched(); if (frontswap && pages_to_unuse > 0) { if (!--pages_to_unuse) break; -- 1.7.9.5 -- To unsubscribe, send a message with 'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ . Don't email: <a href=mailto:"dont@xxxxxxxxx"> email@xxxxxxxxx </a>