The patch titled Subject: psi: annotate refault stalls from IO submission has been removed from the -mm tree. Its filename was psi-annotate-refault-stalls-from-io-submission.patch. This patch was dropped because an alternative patch was merged. ------------------------------------------------------ From: Johannes Weiner <hannes@xxxxxxxxxxx> Subject: psi: annotate refault stalls from IO submission psi tracks the time tasks wait for refaulting pages to become uptodate, but it does not track the time spent submitting the IO. The submission part can be significant if backing storage is contended or when cgroup throttling (io.latency) is in effect - a lot of time is spent in submit_bio(). In that case, we underreport memory pressure. The error scales with how aggressively IO is throttled compared to the device's capability. For example, we have system maintenance software throttled down pretty hard on IO compared to the workload. When memory is contended, the system software starts thrashing cache, but since the backing device is actually pretty fast, the majority of "io time" is from injected throttling delays during submit_bio(). As a result we barely see memory pressure, when the reality is that there is almost no progress due to the thrashing and we should be killing misbehaving stuff. Annotate the submit_bio() paths (or the indirection through readpage) for refaults and swapin to get proper psi coverage of delays there. 
Link: http://lkml.kernel.org/r/20190722201337.19180-1-hannes@xxxxxxxxxxx Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx> Reviewed-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> Cc: Dave Chinner <david@xxxxxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- fs/btrfs/extent_io.c | 14 ++++++++++++-- fs/ext4/readpage.c | 9 +++++++++ fs/f2fs/data.c | 8 ++++++++ fs/mpage.c | 9 +++++++++ mm/filemap.c | 20 ++++++++++++++++++++ mm/page_io.c | 11 ++++++++--- mm/readahead.c | 24 +++++++++++++++++++++++- 7 files changed, 89 insertions(+), 6 deletions(-) --- a/fs/btrfs/extent_io.c~psi-annotate-refault-stalls-from-io-submission +++ a/fs/btrfs/extent_io.c @@ -13,6 +13,7 @@ #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/cleancache.h> +#include <linux/psi.h> #include "extent_io.h" #include "extent_map.h" #include "ctree.h" @@ -4265,6 +4266,9 @@ int extent_readpages(struct address_spac struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree; int nr = 0; u64 prev_em_start = (u64)-1; + int ret = 0; + bool refault = false; + unsigned long pflags; while (!list_empty(pages)) { u64 contig_end = 0; @@ -4279,6 +4283,10 @@ int extent_readpages(struct address_spac put_page(page); break; } + if (PageWorkingset(page) && !refault) { + psi_memstall_enter(&pflags); + refault = true; + } pagepool[nr++] = page; contig_end = page_offset(page) + PAGE_SIZE - 1; @@ -4299,8 +4307,10 @@ int extent_readpages(struct address_spac free_extent_map(em_cached); if (bio) - return submit_one_bio(bio, 0, bio_flags); - return 0; + ret = submit_one_bio(bio, 0, bio_flags); + if (refault) + psi_memstall_leave(&pflags); + return ret; } /* --- a/fs/ext4/readpage.c~psi-annotate-refault-stalls-from-io-submission +++ a/fs/ext4/readpage.c @@ -44,6 +44,7 @@ #include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/cleancache.h> +#include <linux/psi.h> #include "ext4.h" @@ -116,6 +117,8 @@ int ext4_mpage_readpages(struct address_ int length; unsigned 
relative_block = 0; struct ext4_map_blocks map; + bool refault = false; + unsigned long pflags; map.m_pblk = 0; map.m_lblk = 0; @@ -134,6 +137,10 @@ int ext4_mpage_readpages(struct address_ if (add_to_page_cache_lru(page, mapping, page->index, readahead_gfp_mask(mapping))) goto next_page; + if (PageWorkingset(page) && !refault) { + psi_memstall_enter(&pflags); + refault = true; + } } if (page_has_buffers(page)) @@ -291,5 +298,7 @@ int ext4_mpage_readpages(struct address_ BUG_ON(pages && !list_empty(pages)); if (bio) submit_bio(bio); + if (refault) + psi_memstall_leave(&pflags); return 0; } --- a/fs/f2fs/data.c~psi-annotate-refault-stalls-from-io-submission +++ a/fs/f2fs/data.c @@ -1699,6 +1699,8 @@ static int f2fs_mpage_readpages(struct a sector_t last_block_in_bio = 0; struct inode *inode = mapping->host; struct f2fs_map_blocks map; + bool refault = false; + unsigned long pflags; int ret = 0; map.m_pblk = 0; @@ -1720,6 +1722,10 @@ static int f2fs_mpage_readpages(struct a page_index(page), readahead_gfp_mask(mapping))) goto next_page; + if (PageWorkingset(page) && !refault) { + psi_memstall_enter(&pflags); + refault = true; + } } ret = f2fs_read_single_page(inode, page, nr_pages, &map, &bio, @@ -1736,6 +1742,8 @@ next_page: BUG_ON(pages && !list_empty(pages)); if (bio) __submit_bio(F2FS_I_SB(inode), bio, DATA); + if (refault) + psi_memstall_leave(&pflags); return pages ? 
0 : ret; } --- a/fs/mpage.c~psi-annotate-refault-stalls-from-io-submission +++ a/fs/mpage.c @@ -30,6 +30,7 @@ #include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/cleancache.h> +#include <linux/psi.h> #include "internal.h" /* @@ -389,6 +390,8 @@ mpage_readpages(struct address_space *ma .get_block = get_block, .is_readahead = true, }; + bool refault = false; + unsigned long pflags; unsigned page_idx; for (page_idx = 0; page_idx < nr_pages; page_idx++) { @@ -404,10 +407,16 @@ mpage_readpages(struct address_space *ma args.bio = do_mpage_readpage(&args); } put_page(page); + if (PageWorkingset(page) && !refault) { + psi_memstall_enter(&pflags); + refault = true; + } } BUG_ON(!list_empty(pages)); if (args.bio) mpage_bio_submit(REQ_OP_READ, REQ_RAHEAD, args.bio); + if (refault) + psi_memstall_leave(&pflags); return 0; } EXPORT_SYMBOL(mpage_readpages); --- a/mm/filemap.c~psi-annotate-refault-stalls-from-io-submission +++ a/mm/filemap.c @@ -2035,6 +2035,8 @@ static ssize_t generic_file_buffered_rea pgoff_t end_index; loff_t isize; unsigned long nr, ret; + unsigned long pflags; + bool refault; cond_resched(); find_page: @@ -2183,9 +2185,17 @@ readpage: * PG_error will be set again if readpage fails. */ ClearPageError(page); + + refault = PageWorkingset(page); + if (refault) + psi_memstall_enter(&pflags); + /* Start the actual read. The read will unlock the page. 
*/ error = mapping->a_ops->readpage(filp, page); + if (refault) + psi_memstall_leave(&pflags); + if (unlikely(error)) { if (error == AOP_TRUNCATED_PAGE) { put_page(page); @@ -2779,11 +2789,14 @@ static struct page *do_read_cache_page(s void *data, gfp_t gfp) { + bool refault = false; struct page *page; int err; repeat: page = find_get_page(mapping, index); if (!page) { + unsigned long pflags; + page = __page_cache_alloc(gfp); if (!page) return ERR_PTR(-ENOMEM); @@ -2796,12 +2809,19 @@ repeat: return ERR_PTR(err); } + refault = PageWorkingset(page); filler: + if (refault) + psi_memstall_enter(&pflags); + if (filler) err = filler(data, page); else err = mapping->a_ops->readpage(data, page); + if (refault) + psi_memstall_leave(&pflags); + if (err < 0) { put_page(page); return ERR_PTR(err); --- a/mm/page_io.c~psi-annotate-refault-stalls-from-io-submission +++ a/mm/page_io.c @@ -24,6 +24,7 @@ #include <linux/blkdev.h> #include <linux/uio.h> #include <linux/sched/task.h> +#include <linux/psi.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -354,10 +355,14 @@ int swap_readpage(struct page *page, boo struct swap_info_struct *sis = page_swap_info(page); blk_qc_t qc; struct gendisk *disk; + unsigned long pflags; VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); + + psi_memstall_enter(&pflags); + if (frontswap_load(page) == 0) { SetPageUptodate(page); unlock_page(page); @@ -371,7 +376,7 @@ int swap_readpage(struct page *page, boo ret = mapping->a_ops->readpage(swap_file, page); if (!ret) count_vm_event(PSWPIN); - return ret; + goto out; } ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); @@ -382,7 +387,7 @@ int swap_readpage(struct page *page, boo } count_vm_event(PSWPIN); - return 0; + goto out; } ret = 0; @@ -416,8 +421,8 @@ int swap_readpage(struct page *page, boo } __set_current_state(TASK_RUNNING); bio_put(bio); - out: + 
psi_memstall_leave(&pflags); return ret; } --- a/mm/readahead.c~psi-annotate-refault-stalls-from-io-submission +++ a/mm/readahead.c @@ -22,6 +22,7 @@ #include <linux/mm_inline.h> #include <linux/blk-cgroup.h> #include <linux/fadvise.h> +#include <linux/psi.h> #include "internal.h" @@ -92,6 +93,9 @@ int read_cache_pages(struct address_spac int ret = 0; while (!list_empty(pages)) { + unsigned long pflags; + bool refault; + page = lru_to_page(pages); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, @@ -101,7 +105,15 @@ int read_cache_pages(struct address_spac } put_page(page); + refault = PageWorkingset(page); + if (refault) + psi_memstall_enter(&pflags); + ret = filler(data, page); + + if (refault) + psi_memstall_leave(&pflags); + if (unlikely(ret)) { read_cache_pages_invalidate_pages(mapping, pages); break; @@ -132,8 +144,18 @@ static int read_pages(struct address_spa for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = lru_to_page(pages); list_del(&page->lru); - if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) + if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) { + bool refault = PageWorkingset(page); + unsigned long pflags; + + if (refault) + psi_memstall_enter(&pflags); + mapping->a_ops->readpage(filp, page); + + if (refault) + psi_memstall_leave(&pflags); + } put_page(page); } ret = 0; _ Patches currently in -mm which might be from hannes@xxxxxxxxxxx are block-annotate-refault-stalls-from-io-submission.patch