We created a new function, __remove_swap_mapping_batch(), that allows all
pages under the same swap partition to be removed from the swap cache's
mapping with a single acquisition of the mapping's tree_lock. This reduces
contention on the lock when multiple threads are reclaiming memory by
swapping to the same swap partition. The handle_pgout_batch() function is
updated so that all pages under the same swap partition are unmapped
together once they have been paged out.

Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>
---
 mm/vmscan.c | 426 ++++++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 286 insertions(+), 140 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 9fc04e1..5e4b8ce 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -690,6 +690,103 @@ cannot_free:
 	return 0;
 }
 
+/* use this only for swap mapped pages */
+static void __remove_swap_mapping_batch(struct page *pages[],
+				bool reclaimed, short ret[], int nr)
+{
+	unsigned long flags;
+	struct page *page;
+	swp_entry_t swap[SWAP_BATCH];
+	struct address_space *mapping;
+
+	int i, batch_size;
+
+	if (nr <= 0)
+		return;
+
+	while (nr) {
+		mapping = page_mapping(pages[0]);
+		BUG_ON(!mapping);
+
+		batch_size = min(nr, SWAP_BATCH);
+
+		spin_lock_irqsave(&mapping->tree_lock, flags);
+		for (i = 0; i < batch_size; ++i) {
+			page = pages[i];
+
+			BUG_ON(!PageLocked(page));
+			BUG_ON(!PageSwapCache(page));
+			BUG_ON(mapping != page_mapping(page));
+
+			/* stop batching if mapping changes */
+			if (mapping != page_mapping(page)) {
+				batch_size = i;
+				break;
+			}
+			/*
+			 * The non racy check for a busy page.
+			 *
+			 * Must be careful with the order of the tests. When someone has
+			 * a ref to the page, it may be possible that they dirty it then
+			 * drop the reference. So if PageDirty is tested before page_count
+			 * here, then the following race may occur:
+			 *
+			 * get_user_pages(&page);
+			 * [user mapping goes away]
+			 * write_to(page);
+			 *				!PageDirty(page)    [good]
+			 * SetPageDirty(page);
+			 * put_page(page);
+			 *				!page_count(page)   [good, discard it]
+			 *
+			 * [oops, our write_to data is lost]
+			 *
+			 * Reversing the order of the tests ensures such a situation cannot
+			 * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+			 * load is not satisfied before that of page->_count.
+			 *
+			 * Note that if SetPageDirty is always performed via set_page_dirty,
+			 * and thus under tree_lock, then this ordering is not required.
+			 */
+			if (!page_ref_freeze(page, 2))
+				goto cannot_free;
+			/* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+			if (unlikely(PageDirty(page))) {
+				page_ref_unfreeze(page, 2);
+				goto cannot_free;
+			}
+
+			swap[i].val = page_private(page);
+			__delete_from_swap_cache(page);
+
+			ret[i] = 1;
+			continue;
+
+cannot_free:
+			ret[i] = 0;
+		}
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
+
+		/* mem_cgroup accounting needs irqs off, so disable them again here */
+		local_irq_disable();
+		for (i = 0; i < batch_size; ++i) {
+			if (ret[i]) {
+				page = pages[i];
+				mem_cgroup_swapout(page, swap[i]);
+			}
+		}
+		local_irq_enable();
+
+		for (i = 0; i < batch_size; ++i) {
+			if (ret[i])
+				swapcache_free(swap[i]);
+		}
+		/* advance to next batch */
+		pages += batch_size;
+		ret += batch_size;
+		nr -= batch_size;
+	}
+}
 /*
  * Attempt to detach a locked page from its ->mapping. If it is dirty or if
  * someone else has a ref on the page, abort and return 0.  If it was
@@ -897,177 +994,226 @@ static void handle_pgout_batch(struct list_head *page_list,
 		int nr)
 {
 	struct address_space *mapping;
+	struct page *umap_pages[SWAP_BATCH];
 	struct page *page;
-	int i;
-
-	for (i = 0; i < nr; ++i) {
-		page = pages[i];
-		mapping = page_mapping(page);
+	int i, j, batch_size;
+	short umap_ret[SWAP_BATCH], idx[SWAP_BATCH];
+
+	while (nr) {
+		j = 0;
+		batch_size = min(nr, SWAP_BATCH);
+		mapping = NULL;
+
+		for (i = 0; i < batch_size; ++i) {
+			page = pages[i];
+
+			if (mapping) {
+				if (mapping != page_mapping(page)) {
+					/* mapping change, stop batch here */
+					batch_size = i;
+					break;
+				}
+			} else
+				mapping = page_mapping(page);
 
-		/* check outcome of cache addition */
-		if (!ret[i]) {
-			ret[i] = PG_ACTIVATE_LOCKED;
-			continue;
-		}
-		/*
-		 * The page is mapped into the page tables of one or more
-		 * processes. Try to unmap it here.
-		 */
-		if (page_mapped(page) && mapping) {
-			switch (swap_ret[i] = try_to_unmap(page, lazyfree ?
-				(ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
-				(ttu_flags | TTU_BATCH_FLUSH))) {
-			case SWAP_FAIL:
+			/* check outcome of cache addition */
+			if (!ret[i]) {
 				ret[i] = PG_ACTIVATE_LOCKED;
 				continue;
-			case SWAP_AGAIN:
-				ret[i] = PG_KEEP_LOCKED;
-				continue;
-			case SWAP_MLOCK:
-				ret[i] = PG_MLOCKED;
-				continue;
-			case SWAP_LZFREE:
-				goto lazyfree;
-			case SWAP_SUCCESS:
-				; /* try to free the page below */
 			}
-		}
-
-		if (PageDirty(page)) {
 			/*
-			 * Only kswapd can writeback filesystem pages to
-			 * avoid risk of stack overflow but only writeback
-			 * if many dirty pages have been encountered.
+			 * The page is mapped into the page tables of one or more
+			 * processes. Try to unmap it here.
 			 */
-			if (page_is_file_cache(page) &&
-					(!current_is_kswapd() ||
-					 !test_bit(ZONE_DIRTY, &zone->flags))) {
+			if (page_mapped(page) && mapping) {
+				switch (swap_ret[i] = try_to_unmap(page, lazyfree ?
+					(ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
+					(ttu_flags | TTU_BATCH_FLUSH))) {
+				case SWAP_FAIL:
+					ret[i] = PG_ACTIVATE_LOCKED;
+					continue;
+				case SWAP_AGAIN:
+					ret[i] = PG_KEEP_LOCKED;
+					continue;
+				case SWAP_MLOCK:
+					ret[i] = PG_MLOCKED;
+					continue;
+				case SWAP_LZFREE:
+					goto lazyfree;
+				case SWAP_SUCCESS:
+					; /* try to free the page below */
+				}
+			}
+
+			if (PageDirty(page)) {
 				/*
-				 * Immediately reclaim when written back.
-				 * Similar in principal to deactivate_page()
-				 * except we already have the page isolated
-				 * and know it's dirty
+				 * Only kswapd can writeback filesystem pages to
+				 * avoid risk of stack overflow but only writeback
+				 * if many dirty pages have been encountered.
 				 */
-				inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
-				SetPageReclaim(page);
-
-				ret[i] = PG_KEEP_LOCKED;
-				continue;
-			}
+				if (page_is_file_cache(page) &&
+						(!current_is_kswapd() ||
+						 !test_bit(ZONE_DIRTY, &zone->flags))) {
+					/*
+					 * Immediately reclaim when written back.
+					 * Similar in principal to deactivate_page()
+					 * except we already have the page isolated
+					 * and know it's dirty
+					 */
+					inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+					SetPageReclaim(page);
 
-			if (references == PAGEREF_RECLAIM_CLEAN) {
-				ret[i] = PG_KEEP_LOCKED;
-				continue;
-			}
-			if (!may_enter_fs) {
-				ret[i] = PG_KEEP_LOCKED;
-				continue;
-			}
-			if (!sc->may_writepage) {
-				ret[i] = PG_KEEP_LOCKED;
-				continue;
-			}
+					ret[i] = PG_KEEP_LOCKED;
+					continue;
+				}
 
-			/*
-			 * Page is dirty. Flush the TLB if a writable entry
-			 * potentially exists to avoid CPU writes after IO
-			 * starts and then write it out here.
-			 */
-			try_to_unmap_flush_dirty();
-			switch (pageout(page, mapping, sc)) {
-			case PAGE_KEEP:
-				ret[i] = PG_KEEP_LOCKED;
-				continue;
-			case PAGE_ACTIVATE:
-				ret[i] = PG_ACTIVATE_LOCKED;
-				continue;
-			case PAGE_SUCCESS:
-				if (PageWriteback(page)) {
-					ret[i] = PG_KEEP;
+				if (references == PAGEREF_RECLAIM_CLEAN) {
+					ret[i] = PG_KEEP_LOCKED;
+					continue;
+				}
+				if (!may_enter_fs) {
+					ret[i] = PG_KEEP_LOCKED;
 					continue;
 				}
-				if (PageDirty(page)) {
-					ret[i] = PG_KEEP;
+				if (!sc->may_writepage) {
+					ret[i] = PG_KEEP_LOCKED;
 					continue;
 				}
 
 				/*
-				 * A synchronous write - probably a ramdisk. Go
-				 * ahead and try to reclaim the page.
+				 * Page is dirty. Flush the TLB if a writable entry
+				 * potentially exists to avoid CPU writes after IO
+				 * starts and then write it out here.
 				 */
-				if (!trylock_page(page)) {
-					ret[i] = PG_KEEP;
-					continue;
-				}
-				if (PageDirty(page) || PageWriteback(page)) {
+				try_to_unmap_flush_dirty();
+				switch (pageout(page, mapping, sc)) {
+				case PAGE_KEEP:
 					ret[i] = PG_KEEP_LOCKED;
 					continue;
+				case PAGE_ACTIVATE:
+					ret[i] = PG_ACTIVATE_LOCKED;
+					continue;
+				case PAGE_SUCCESS:
+					if (PageWriteback(page)) {
+						ret[i] = PG_KEEP;
+						continue;
+					}
+					if (PageDirty(page)) {
+						ret[i] = PG_KEEP;
+						continue;
+					}
+
+					/*
+					 * A synchronous write - probably a ramdisk. Go
+					 * ahead and try to reclaim the page.
+					 */
+					if (!trylock_page(page)) {
+						ret[i] = PG_KEEP;
+						continue;
+					}
+					if (PageDirty(page) || PageWriteback(page)) {
+						ret[i] = PG_KEEP_LOCKED;
+						continue;
+					}
+					mapping = page_mapping(page);
+				case PAGE_CLEAN:
+					; /* try to free the page below */
 				}
-				mapping = page_mapping(page);
-			case PAGE_CLEAN:
-				; /* try to free the page below */
 			}
-		}
 
-		/*
-		 * If the page has buffers, try to free the buffer mappings
-		 * associated with this page. If we succeed we try to free
-		 * the page as well.
-		 *
-		 * We do this even if the page is PageDirty().
-		 * try_to_release_page() does not perform I/O, but it is
-		 * possible for a page to have PageDirty set, but it is actually
-		 * clean (all its buffers are clean). This happens if the
-		 * buffers were written out directly, with submit_bh(). ext3
-		 * will do this, as well as the blockdev mapping.
-		 * try_to_release_page() will discover that cleanness and will
-		 * drop the buffers and mark the page clean - it can be freed.
-		 *
-		 * Rarely, pages can have buffers and no ->mapping. These are
-		 * the pages which were not successfully invalidated in
-		 * truncate_complete_page(). We try to drop those buffers here
+			 * and if that worked, and the page is no longer mapped into
+			 * process address space (page_count == 1) it can be freed.
+			 * Otherwise, leave the page on the LRU so it is swappable.
+			 */
+			if (page_has_private(page)) {
+				if (!try_to_release_page(page, sc->gfp_mask)) {
+					ret[i] = PG_ACTIVATE_LOCKED;
+					continue;
+				}
+				if (!mapping && page_count(page) == 1) {
+					unlock_page(page);
+					if (put_page_testzero(page)) {
+						ret[i] = PG_FREE;
+						continue;
+					} else {
+						/*
+						 * rare race with speculative reference.
+						 * the speculative reference will free
+						 * this page shortly, so we may
+						 * increment nr_reclaimed (and
+						 * leave it off the LRU).
+						 */
+						ret[i] = PG_SPECULATIVE_REF;
+						continue;
+					}
+				}
+			}
+lazyfree:
+			if (!mapping) {
+				ret[i] = PG_KEEP_LOCKED;
 				continue;
 			}
-		if (!mapping && page_count(page) == 1) {
-			unlock_page(page);
-			if (put_page_testzero(page)) {
-				ret[i] = PG_FREE;
-				continue;
-			} else {
-				/*
-				 * rare race with speculative reference.
-				 * the speculative reference will free
-				 * this page shortly, so we may
-				 * increment nr_reclaimed (and
-				 * leave it off the LRU).
-				 */
-				ret[i] = PG_SPECULATIVE_REF;
+			if (!PageSwapCache(page)) {
+				if (!__remove_mapping(mapping, page, true)) {
+					ret[i] = PG_KEEP_LOCKED;
 					continue;
 				}
+				__ClearPageLocked(page);
+				ret[i] = PG_FREE;
+				continue;
 			}
+
+			/* note pages to be unmapped */
+			ret[i] = PG_UNKNOWN;
+			idx[j] = i;
+			umap_pages[j] = page;
+			++j;
 		}
-lazyfree:
-		if (!mapping || !__remove_mapping(mapping, page, true)) {
-			ret[i] = PG_KEEP_LOCKED;
-			continue;
+
+		/* handle remaining pages that need to be unmapped */
+		__remove_swap_mapping_batch(umap_pages, true, umap_ret, j);
+
+		for (i = 0; i < j; ++i) {
+			if (!umap_ret[i]) {
+				/* unmap failed */
+				ret[idx[i]] = PG_KEEP_LOCKED;
+				continue;
+			}
+
+			page = umap_pages[i];
+			/*
+			 * At this point, we have no other references and there is
+			 * no way to pick any more up (removed from LRU, removed
+			 * from pagecache). Can use non-atomic bitops now (and
+			 * we obviously don't have to worry about waking up a process
+			 * waiting on the page lock, because there are no references.
+			 */
+			__ClearPageLocked(page);
+			ret[idx[i]] = PG_FREE;
 		}
-		/*
-		 * At this point, we have no other references and there is
-		 * no way to pick any more up (removed from LRU, removed
-		 * from pagecache). Can use non-atomic bitops now (and
-		 * we obviously don't have to worry about waking up a process
-		 * waiting on the page lock, because there are no references.
-		 */
-		__ClearPageLocked(page);
-		ret[i] = PG_FREE;
+		/* advance pointers to next batch and remaining page count */
+		nr = nr - batch_size;
+		pages += batch_size;
+		ret += batch_size;
+		swap_ret += batch_size;
 	}
 }
-- 
2.5.5
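
As a rough user-space sketch of the locking pattern the patch relies on -
paying for one tree_lock round trip per batch of pages that share a mapping
instead of one per page - consider the C fragment below. Everything in it
(fake_mapping, fake_page, remove_batched, the SWAP_BATCH value, ...) is an
illustrative stand-in, not the real mm/vmscan.c code or data structures.

#include <pthread.h>

#define SWAP_BATCH	64		/* stand-in batch size, assumed here */

struct fake_mapping {
	pthread_mutex_t tree_lock;	/* stands in for mapping->tree_lock */
};

struct fake_page {
	struct fake_mapping *mapping;
	int in_swap_cache;
};

/* One lock/unlock round trip per page: the per-page removal cost. */
static void remove_one_by_one(struct fake_page *pages[], int nr)
{
	int i;

	for (i = 0; i < nr; i++) {
		pthread_mutex_lock(&pages[i]->mapping->tree_lock);
		pages[i]->in_swap_cache = 0;
		pthread_mutex_unlock(&pages[i]->mapping->tree_lock);
	}
}

/* One lock/unlock round trip per batch of pages sharing a mapping. */
static void remove_batched(struct fake_page *pages[], int nr)
{
	while (nr > 0) {
		struct fake_mapping *mapping = pages[0]->mapping;
		int batch = nr < SWAP_BATCH ? nr : SWAP_BATCH;
		int i;

		pthread_mutex_lock(&mapping->tree_lock);
		for (i = 0; i < batch; i++) {
			if (pages[i]->mapping != mapping)
				break;	/* mapping changed, stop this batch */
			pages[i]->in_swap_cache = 0;
		}
		pthread_mutex_unlock(&mapping->tree_lock);

		pages += i;	/* i >= 1, since pages[0] always matches */
		nr -= i;
	}
}

int main(void)
{
	struct fake_mapping m = { PTHREAD_MUTEX_INITIALIZER };
	struct fake_page p[3] = { { &m, 1 }, { &m, 1 }, { &m, 1 } };
	struct fake_page *list[3] = { &p[0], &p[1], &p[2] };

	remove_one_by_one(list, 3);	/* three lock round trips */
	p[0].in_swap_cache = p[1].in_swap_cache = p[2].in_swap_cache = 1;
	remove_batched(list, 3);	/* one lock round trip */
	return 0;
}

Under contention the batched variant is what keeps concurrent reclaim
threads from serializing on the lock once per page, which is the effect
described in the commit message above.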