On Tue, May 31, 2016 at 10:17:23AM -0700, Tim Chen wrote:
> On Tue, May 31, 2016 at 06:15:50PM +0900, Minchan Kim wrote:
> > Hello Tim,
> >
> > checking file mm/vmscan.c
> > patch: **** malformed patch at line 89: mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
> >
> > Could you resend a formal patch?
> >
> > Thanks.
>
> My mail client is misbehaving after a system upgrade.
> Here's the patch again.
>
> Subject: [PATCH] mm: Cleanup - Reorganize the shrink_page_list code into smaller functions
>
> This patch consolidates the page out and the various cleanup operations
> within the shrink_page_list function into the handle_pgout and pg_finish
> functions.
>
> This makes the shrink_page_list function more concise and allows for
> the separation of the page out and page scan operations at a later time.
> This is desirable if we want to group similar pages together and batch
> process them in the page out path.
>
> After we have scanned a page in shrink_page_list and completed any paging,
> the final disposition and clean up of the page is consolidated into
> pg_finish. The disposition of the page decided by the page scanning in
> shrink_page_list is marked with one of the designations in pg_result.
>
> There is no intention to change any functionality or logic in this patch.
>
> Signed-off-by: Tim Chen <tim.c.chen@xxxxxxxxxxxxxxx>

Hi Tim,

Frankly speaking, this reorganization is too limited for what I have in
mind. It works only for your goal, which I guess is to allocate batched
swap slots. :)

My goal is to make this code path work with batched page_check_references,
batched try_to_unmap and batched __remove_mapping, where we can avoid
taking the mapping-level lock (e.g., the anon_vma lock or i_mmap_lock)
for every page, hoping that such batched locking helps system performance
when the batched pages share the same inode or anon_vma.

So, to show my intention roughly, I coded up the following in a short
time; it cannot actually work, but it compiles. It is just to show the
intention. If you guys think it is worthwhile, I will try to make a
complete patch.

Thanks.
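To make the lock-amortization idea above concrete before the diff, here is a
minimal, hypothetical sketch (not part of the patch below; the names
spl_batch_same_mapping and process_locked are made up). It shows a batch
stage that takes a file mapping's i_mmap_rwsem once per run of pages sharing
that mapping, assuming the isolated list has already been grouped by mapping
and that the per-page helper is a variant that expects the caller to hold the
lock. Anon pages would need the analogous anon_vma lock instead.

/*
 * Illustration only: amortize the per-mapping lock across a run of
 * file-backed pages that share the same mapping.  Assumes the list has
 * been grouped by mapping beforehand; process_locked() is a hypothetical
 * per-page helper that must NOT take the mapping lock itself.
 */
static void spl_batch_same_mapping(struct list_head *page_list,
				   int (*process_locked)(struct page *page))
{
	struct page *page, *tmp;
	struct address_space *held = NULL;	/* mapping whose lock we hold */

	list_for_each_entry_safe(page, tmp, page_list, lru) {
		struct address_space *mapping = page_mapping(page);

		/* Only batch file pages here; skip anon/no-mapping pages. */
		if (!mapping || PageAnon(page))
			continue;

		if (mapping != held) {
			/* Switch locks only when the mapping changes. */
			if (held)
				i_mmap_unlock_read(held);
			i_mmap_lock_read(mapping);
			held = mapping;
		}
		process_locked(page);
	}
	if (held)
		i_mmap_unlock_read(held);
}

A real series would presumably split this per lock type (i_mmap_rwsem for
file pages, the anon_vma lock for anon pages) and feed it the sublists
produced by the spl_* stages in the diff below.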
diff --git a/mm/vmscan.c b/mm/vmscan.c index 2aec4241b42a..5276b160db00 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -106,6 +106,15 @@ struct scan_control { unsigned long nr_reclaimed; }; +struct congest_control { + unsigned long nr_unqueued_dirty; + unsigned long nr_dirty; + unsigned long nr_congested; + unsigned long nr_writeback; + unsigned long nr_immediate; + gfp_t gfp_mask; +}; + #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) #ifdef ARCH_HAS_PREFETCH @@ -874,360 +883,260 @@ static void page_check_dirty_writeback(struct page *page, mapping->a_ops->is_dirty_writeback(page, dirty, writeback); } -/* - * shrink_page_list() returns the number of reclaimed pages - */ -static unsigned long shrink_page_list(struct list_head *page_list, - struct zone *zone, - struct scan_control *sc, - enum ttu_flags ttu_flags, - unsigned long *ret_nr_dirty, - unsigned long *ret_nr_unqueued_dirty, - unsigned long *ret_nr_congested, - unsigned long *ret_nr_writeback, - unsigned long *ret_nr_immediate, - bool force_reclaim) +static int spl_batch_pages(struct list_head *page_list, + struct list_head *failed_list, + int (*process)(struct page *page, void *private), + void *private) { - LIST_HEAD(ret_pages); - LIST_HEAD(free_pages); - int pgactivate = 0; - unsigned long nr_unqueued_dirty = 0; - unsigned long nr_dirty = 0; - unsigned long nr_congested = 0; - unsigned long nr_reclaimed = 0; - unsigned long nr_writeback = 0; - unsigned long nr_immediate = 0; - - cond_resched(); - - while (!list_empty(page_list)) { - struct address_space *mapping; - struct page *page; - int may_enter_fs; - enum page_references references = PAGEREF_RECLAIM_CLEAN; - bool dirty, writeback; - - cond_resched(); - - page = lru_to_page(page_list); - list_del(&page->lru); - - if (!trylock_page(page)) - goto keep; + struct page *page, *tmp; + int ret; + int nr_failed = 0; - VM_BUG_ON_PAGE(PageActive(page), page); - VM_BUG_ON_PAGE(page_zone(page) != zone, page); + list_for_each_entry_safe(page, tmp, page_list, lru) { + ret = process(page, private); + if (!ret) { + list_move(&page->lru, failed_list); + VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), + page); + nr_failed++; + } + } - sc->nr_scanned++; + return nr_failed; +} - if (unlikely(!page_evictable(page))) - goto cull_mlocked; +static int spl_trylock_page(struct page *page, void *private) +{ + struct scan_control *sc = private; - if (!sc->may_unmap && page_mapped(page)) - goto keep_locked; + VM_BUG_ON_PAGE(PageActive(page), page); + sc->nr_scanned++; - /* Double the slab pressure for mapped and swapcache pages */ - if (page_mapped(page) || PageSwapCache(page)) - sc->nr_scanned++; + return trylock_page(page); +} - may_enter_fs = (sc->gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); +static int spl_check_evictable(struct page *page, void *private) +{ + struct scan_control *sc = private; - /* - * The number of dirty pages determines if a zone is marked - * reclaim_congested which affects wait_iff_congested. kswapd - * will stall and start writing pages if the tail of the LRU - * is all dirty unqueued pages. 
- */ - page_check_dirty_writeback(page, &dirty, &writeback); - if (dirty || writeback) - nr_dirty++; + if (unlikely(!page_evictable(page))) { + if (PageSwapCache(page)) + try_to_free_swap(page); + unlock_page(page); + return 0; + } - if (dirty && !writeback) - nr_unqueued_dirty++; + if (!sc->may_unmap && page_mapped(page)) { + unlock_page(page); + return 0; + } - /* - * Treat this page as congested if the underlying BDI is or if - * pages are cycling through the LRU so quickly that the - * pages marked for immediate reclaim are making it to the - * end of the LRU a second time. - */ - mapping = page_mapping(page); - if (((dirty || writeback) && mapping && - inode_write_congested(mapping->host)) || - (writeback && PageReclaim(page))) - nr_congested++; + /* Double the slab pressure for mapped and swapcache pages */ + if (page_mapped(page) || PageSwapCache(page)) + sc->nr_scanned++; - /* - * If a page at the tail of the LRU is under writeback, there - * are three cases to consider. - * - * 1) If reclaim is encountering an excessive number of pages - * under writeback and this page is both under writeback and - * PageReclaim then it indicates that pages are being queued - * for IO but are being recycled through the LRU before the - * IO can complete. Waiting on the page itself risks an - * indefinite stall if it is impossible to writeback the - * page due to IO error or disconnected storage so instead - * note that the LRU is being scanned too quickly and the - * caller can stall after page list has been processed. - * - * 2) Global or new memcg reclaim encounters a page that is - * not marked for immediate reclaim, or the caller does not - * have __GFP_FS (or __GFP_IO if it's simply going to swap, - * not to fs). In this case mark the page for immediate - * reclaim and continue scanning. - * - * Require may_enter_fs because we would wait on fs, which - * may not have submitted IO yet. And the loop driver might - * enter reclaim, and deadlock if it waits on a page for - * which it is needed to do the write (loop masks off - * __GFP_IO|__GFP_FS for this reason); but more thought - * would probably show more reasons. - * - * 3) Legacy memcg encounters a page that is already marked - * PageReclaim. memcg does not have any dirty pages - * throttling so we could easily OOM just because too many - * pages are in writeback and there is nothing else to - * reclaim. Wait for the writeback to complete. - */ - if (PageWriteback(page)) { - /* Case 1 above */ - if (current_is_kswapd() && - PageReclaim(page) && - test_bit(ZONE_WRITEBACK, &zone->flags)) { - nr_immediate++; - goto keep_locked; - - /* Case 2 above */ - } else if (sane_reclaim(sc) || - !PageReclaim(page) || !may_enter_fs) { - /* - * This is slightly racy - end_page_writeback() - * might have just cleared PageReclaim, then - * setting PageReclaim here end up interpreted - * as PageReadahead - but that does not matter - * enough to care. What we do want is for this - * page to have PageReclaim set next time memcg - * reclaim reaches the tests above, so it will - * then wait_on_page_writeback() to avoid OOM; - * and it's also appropriate in global reclaim. 
- */ - SetPageReclaim(page); - nr_writeback++; - goto keep_locked; + return 1; +} - /* Case 3 above */ - } else { - unlock_page(page); - wait_on_page_writeback(page); - /* then go back and try same page again */ - list_add_tail(&page->lru, page_list); - continue; - } - } - if (!force_reclaim) - references = page_check_references(page, sc); - - switch (references) { - case PAGEREF_ACTIVATE: - goto activate_locked; - case PAGEREF_KEEP: - goto keep_locked; - case PAGEREF_RECLAIM: - case PAGEREF_RECLAIM_CLEAN: - ; /* try to reclaim the page below */ - } +static int spl_check_congestion(struct page *page, void *private) +{ + bool dirty, writeback; + struct congest_control *cc = private; + gfp_t gfp_mask = cc->gfp_mask; + struct zone *zone = page_zone(page); + struct address_space *mapping; + int may_enter_fs; - /* - * Anonymous process memory has backing store? - * Try to allocate it some swap space here. - */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!(sc->gfp_mask & __GFP_IO)) - goto keep_locked; - if (!add_to_swap(page, page_list)) - goto activate_locked; - may_enter_fs = 1; - - /* Adding to swap updated mapping */ - mapping = page_mapping(page); - } + may_enter_fs = (gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (gfp_mask & __GFP_IO)); - /* - * The page is mapped into the page tables of one or more - * processes. Try to unmap it here. - */ - if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, - ttu_flags|TTU_BATCH_FLUSH)) { - case SWAP_FAIL: - goto activate_locked; - case SWAP_AGAIN: - goto keep_locked; - case SWAP_MLOCK: - goto cull_mlocked; - case SWAP_SUCCESS: - ; /* try to free the page below */ - } - } - - if (PageDirty(page)) { - /* - * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow but only writeback - * if many dirty pages have been encountered. - */ - if (page_is_file_cache(page) && - (!current_is_kswapd() || - !test_bit(ZONE_DIRTY, &zone->flags))) { - /* - * Immediately reclaim when written back. - * Similar in principal to deactivate_page() - * except we already have the page isolated - * and know it's dirty - */ - inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); - SetPageReclaim(page); + /* + * The number of dirty pages determines if a zone is marked + * reclaim_congested which affects wait_iff_congested. kswapd + * will stall and start writing pages if the tail of the LRU + * is all dirty unqueued pages. + */ + page_check_dirty_writeback(page, &dirty, &writeback); + if (dirty || writeback) + cc->nr_dirty++; - goto keep_locked; - } + if (dirty && !writeback) + cc->nr_unqueued_dirty++; - if (references == PAGEREF_RECLAIM_CLEAN) - goto keep_locked; - if (!may_enter_fs) - goto keep_locked; - if (!sc->may_writepage) - goto keep_locked; + /* + * Treat this page as congested if the underlying BDI is or if + * pages are cycling through the LRU so quickly that the + * pages marked for immediate reclaim are making it to the + * end of the LRU a second time. + */ + mapping = page_mapping(page); + if (((dirty || writeback) && mapping && + inode_write_congested(mapping->host)) || + (writeback && PageReclaim(page))) + cc->nr_congested++; + /* + * If a page at the tail of the LRU is under writeback, there + * are three cases to consider. + * + * 1) If reclaim is encountering an excessive number of pages + * under writeback and this page is both under writeback and + * PageReclaim then it indicates that pages are being queued + * for IO but are being recycled through the LRU before the + * IO can complete. 
Waiting on the page itself risks an + * indefinite stall if it is impossible to writeback the + * page due to IO error or disconnected storage so instead + * note that the LRU is being scanned too quickly and the + * caller can stall after page list has been processed. + * + * 2) Global or new memcg reclaim encounters a page that is + * not marked for immediate reclaim, or the caller does not + * have __GFP_FS (or __GFP_IO if it's simply going to swap, + * not to fs). In this case mark the page for immediate + * reclaim and continue scanning. + * + * Require may_enter_fs because we would wait on fs, which + * may not have submitted IO yet. And the loop driver might + * enter reclaim, and deadlock if it waits on a page for + * which it is needed to do the write (loop masks off + * __GFP_IO|__GFP_FS for this reason); but more thought + * would probably show more reasons. + * + * 3) Legacy memcg encounters a page that is already marked + * PageReclaim. memcg does not have any dirty pages + * throttling so we could easily OOM just because too many + * pages are in writeback and there is nothing else to + * reclaim. Wait for the writeback to complete. + */ + if (PageWriteback(page)) { + /* Case 1 above */ + if (current_is_kswapd() && + PageReclaim(page) && + test_bit(ZONE_WRITEBACK, &zone->flags)) { + cc->nr_immediate++; + unlock_page(page); + return 0; + /* Case 2 above */ + } else if (sane_reclaim(sc) || + !PageReclaim(page) || !may_enter_fs) { /* - * Page is dirty. Flush the TLB if a writable entry - * potentially exists to avoid CPU writes after IO - * starts and then write it out here. + * This is slightly racy - end_page_writeback() + * might have just cleared PageReclaim, then + * setting PageReclaim here end up interpreted + * as PageReadahead - but that does not matter + * enough to care. What we do want is for this + * page to have PageReclaim set next time memcg + * reclaim reaches the tests above, so it will + * then wait_on_page_writeback() to avoid OOM; + * and it's also appropriate in global reclaim. */ - try_to_unmap_flush_dirty(); - switch (pageout(page, mapping, sc)) { - case PAGE_KEEP: - goto keep_locked; - case PAGE_ACTIVATE: - goto activate_locked; - case PAGE_SUCCESS: - if (PageWriteback(page)) - goto keep; - if (PageDirty(page)) - goto keep; + SetPageReclaim(page); + cc->nr_writeback++; + unlock_page(page); + return 0; - /* - * A synchronous write - probably a ramdisk. Go - * ahead and try to reclaim the page. - */ - if (!trylock_page(page)) - goto keep; - if (PageDirty(page) || PageWriteback(page)) - goto keep_locked; - mapping = page_mapping(page); - case PAGE_CLEAN: - ; /* try to free the page below */ - } - } - - /* - * If the page has buffers, try to free the buffer mappings - * associated with this page. If we succeed we try to free - * the page as well. - * - * We do this even if the page is PageDirty(). - * try_to_release_page() does not perform I/O, but it is - * possible for a page to have PageDirty set, but it is actually - * clean (all its buffers are clean). This happens if the - * buffers were written out directly, with submit_bh(). ext3 - * will do this, as well as the blockdev mapping. - * try_to_release_page() will discover that cleanness and will - * drop the buffers and mark the page clean - it can be freed. - * - * Rarely, pages can have buffers and no ->mapping. These are - * the pages which were not successfully invalidated in - * truncate_complete_page(). 
We try to drop those buffers here - * and if that worked, and the page is no longer mapped into - * process address space (page_count == 1) it can be freed. - * Otherwise, leave the page on the LRU so it is swappable. - */ - if (page_has_private(page)) { - if (!try_to_release_page(page, sc->gfp_mask)) - goto activate_locked; - if (!mapping && page_count(page) == 1) { - unlock_page(page); - if (put_page_testzero(page)) - goto free_it; - else { - /* - * rare race with speculative reference. - * the speculative reference will free - * this page shortly, so we may - * increment nr_reclaimed here (and - * leave it off the LRU). - */ - nr_reclaimed++; - continue; - } - } + /* Case 3 above */ + } else { + unlock_page(page); + wait_on_page_writeback(page); + /* XXXX */ + /* then go back and try same page again */ + // list_move(&page->lru, page_list); + // continue; } + } - if (!mapping || !__remove_mapping(mapping, page, true)) - goto keep_locked; + return 1; +} - /* - * At this point, we have no other references and there is - * no way to pick any more up (removed from LRU, removed - * from pagecache). Can use non-atomic bitops now (and - * we obviously don't have to worry about waking up a process - * waiting on the page lock, because there are no references. - */ - __clear_page_locked(page); -free_it: - nr_reclaimed++; +static int spl_page_reference_check(struct page *page, void *private) +{ + struct scan_control *sc = private; + enum page_references references = PAGEREF_RECLAIM_CLEAN; - /* - * Is there need to periodically free_page_list? It would - * appear not as the counts should be low - */ - list_add(&page->lru, &free_pages); - continue; + references = page_check_references(page, sc); -cull_mlocked: - if (PageSwapCache(page)) + switch (references) { + case PAGEREF_ACTIVATE: + if (PageSwapCache(page) && vm_swap_full()) try_to_free_swap(page); + SetPageActive(page); + count_vm_event(PGACTIVATE); + return 0; + case PAGEREF_KEEP: unlock_page(page); - list_add(&page->lru, &ret_pages); - continue; + return 0; + case PAGEREF_RECLAIM: + case PAGEREF_RECLAIM_CLEAN: + ; /* try to reclaim the page below */ + } -activate_locked: - /* Not a candidate for swapping, so reclaim swap space. 
*/ + return 1; +} + +static int spl_page_reference_check(struct page *page, void *private) +{ + struct scan_control *sc = private; + enum page_references references = PAGEREF_RECLAIM_CLEAN; + + references = page_check_references(page, sc); + + switch (references) { + case PAGEREF_ACTIVATE: if (PageSwapCache(page) && vm_swap_full()) try_to_free_swap(page); - VM_BUG_ON_PAGE(PageActive(page), page); SetPageActive(page); - pgactivate++; -keep_locked: + count_vm_event(PGACTIVATE); + return 0; + case PAGEREF_KEEP: unlock_page(page); -keep: - list_add(&page->lru, &ret_pages); - VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); + return 0; + case PAGEREF_RECLAIM: + case PAGEREF_RECLAIM_CLEAN: + ; /* try to reclaim the page below */ } + return 1; +} + +/* + * shrink_page_list() returns the number of reclaimed pages + */ +static unsigned long shrink_page_list(struct list_head *page_list, + struct zone *zone, + struct scan_control *sc, + enum ttu_flags ttu_flags, + struct congest_control *cc, + bool force_reclaim) +{ + LIST_HEAD(ret_pages); + LIST_HEAD(free_pages); + int pgactivate = 0; + unsigned long nr_reclaimed = 0; + + cond_resched(); + + cc->gfp_mask = sc->gfp_mask; + spl_batch_pages(page_list, &ret_pages, spl_trylock_page, sc); + spl_batch_pages(page_list, &ret_pages, spl_check_evictable, sc); + spl_batch_pages(page_list, &ret_pages, spl_check_congestion, cc); + if (!force_reclaim) + spl_batch_pages(page_list, &ret_pages, + spl_page_reference_check, sc); + spl_batch_pages(page_list, &ret_pages, + spl_alloc_swap_slot, sc); + spl_batch_pages(page_list, &ret_pages, + spl_try_to_unmap, sc); + spl_batch_pages(page_list, &free_pages, + spl_try_to_free, sc); + mem_cgroup_uncharge_list(&free_pages); - try_to_unmap_flush(); free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); - *ret_nr_dirty += nr_dirty; - *ret_nr_congested += nr_congested; - *ret_nr_unqueued_dirty += nr_unqueued_dirty; - *ret_nr_writeback += nr_writeback; - *ret_nr_immediate += nr_immediate; return nr_reclaimed; } @@ -1239,8 +1148,9 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; - unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; + struct congest_control cc; struct page *page, *next; + unsigned long ret; LIST_HEAD(clean_pages); list_for_each_entry_safe(page, next, page_list, lru) { @@ -1252,8 +1162,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, - &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); + TTU_UNMAP|TTU_IGNORE_ACCESS, &cc, true); list_splice(&clean_pages, page_list); mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); return ret; @@ -1571,6 +1480,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, int file = is_file_lru(lru); struct zone *zone = lruvec_zone(lruvec); struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; + struct congest_control cc; while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1607,10 +1517,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; + memset(&cc, 0, sizeof(struct congest_control)); nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, - &nr_dirty, &nr_unqueued_dirty, &nr_congested, - &nr_writeback, &nr_immediate, - false); + &cc, false); spin_lock_irq(&zone->lru_lock); -- To unsubscribe, send a message with 
'unsubscribe linux-mm' in the body to majordomo@xxxxxxxxx. For more info on Linux MM, see: http://www.linux-mm.org/ .