Instead of implementing our own writeback clustering use write_cache_pages to do it for us. This means the guts of the current writepage implementation become a new helper used both for implementing ->writepage and as a callback to write_cache_pages for ->writepages. A new struct xfs_writeback_ctx is used to track block mapping state and the ioend chain over multiple invocation of it. The advantage over the old code is that we avoid a double pagevec lookup, and a more efficient handling of extent boundaries inside a page for small blocksize filesystems, as well as having less XFS specific code. The downside is that we don't do writeback clustering when called from kswapd anyore, but that is a case that should be avoided anyway. Note that we still convert the whole delalloc range from ->writepage, so the on-disk allocation pattern is not affected. Signed-off-by: Christoph Hellwig <hch@xxxxxx> Index: linux-2.6/fs/xfs/linux-2.6/xfs_aops.c =================================================================== --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_aops.c 2011-04-27 20:55:01.482820427 +0200 +++ linux-2.6/fs/xfs/linux-2.6/xfs_aops.c 2011-04-28 11:22:42.747447011 +0200 @@ -38,6 +38,12 @@ #include <linux/pagevec.h> #include <linux/writeback.h> +struct xfs_writeback_ctx { + unsigned int imap_valid; + struct xfs_bmbt_irec imap; + struct xfs_ioend *iohead; + struct xfs_ioend *ioend; +}; /* * Prime number of hash buckets since address is used as the key. @@ -487,6 +493,7 @@ xfs_submit_ioend( struct buffer_head *bh; struct bio *bio; sector_t lastblock = 0; + struct blk_plug plug; /* Pass 1 - start writeback */ do { @@ -496,6 +503,7 @@ xfs_submit_ioend( } while ((ioend = next) != NULL); /* Pass 2 - submit I/O */ + blk_start_plug(&plug); ioend = head; do { next = ioend->io_list; @@ -522,6 +530,7 @@ xfs_submit_ioend( xfs_submit_ioend_bio(wbc, ioend, bio); xfs_finish_ioend(ioend); } while ((ioend = next) != NULL); + blk_finish_plug(&plug); } /* @@ -661,153 +670,6 @@ xfs_is_delayed_page( return 0; } -/* - * Allocate & map buffers for page given the extent map. Write it out. - * except for the original page of a writepage, this is called on - * delalloc/unwritten pages only, for the original page it is possible - * that the page has no mapping at all. - */ -STATIC int -xfs_convert_page( - struct inode *inode, - struct page *page, - loff_t tindex, - struct xfs_bmbt_irec *imap, - xfs_ioend_t **ioendp, - struct writeback_control *wbc) -{ - struct buffer_head *bh, *head; - xfs_off_t end_offset; - unsigned long p_offset; - unsigned int type; - int len, page_dirty; - int count = 0, done = 0, uptodate = 1; - xfs_off_t offset = page_offset(page); - - if (page->index != tindex) - goto fail; - if (!trylock_page(page)) - goto fail; - if (PageWriteback(page)) - goto fail_unlock_page; - if (page->mapping != inode->i_mapping) - goto fail_unlock_page; - if (!xfs_is_delayed_page(page, (*ioendp)->io_type)) - goto fail_unlock_page; - - /* - * page_dirty is initially a count of buffers on the page before - * EOF and is decremented as we move each into a cleanable state. - * - * Derivation: - * - * End offset is the highest offset that this page should represent. - * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1)) - * will evaluate non-zero and be less than PAGE_CACHE_SIZE and - * hence give us the correct page_dirty count. On any other page, - * it will be zero and in that case we need page_dirty to be the - * count of buffers on the page. - */ - end_offset = min_t(unsigned long long, - (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, - i_size_read(inode)); - - len = 1 << inode->i_blkbits; - p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1), - PAGE_CACHE_SIZE); - p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE; - page_dirty = p_offset / len; - - bh = head = page_buffers(page); - do { - if (offset >= end_offset) - break; - if (!buffer_uptodate(bh)) - uptodate = 0; - if (!(PageUptodate(page) || buffer_uptodate(bh))) { - done = 1; - continue; - } - - if (buffer_unwritten(bh) || buffer_delay(bh) || - buffer_mapped(bh)) { - if (buffer_unwritten(bh)) - type = IO_UNWRITTEN; - else if (buffer_delay(bh)) - type = IO_DELALLOC; - else - type = IO_OVERWRITE; - - if (!xfs_imap_valid(inode, imap, offset)) { - done = 1; - continue; - } - - lock_buffer(bh); - if (type != IO_OVERWRITE) - xfs_map_at_offset(inode, bh, imap, offset); - xfs_add_to_ioend(inode, bh, offset, type, - ioendp, done); - - page_dirty--; - count++; - } else { - done = 1; - } - } while (offset += len, (bh = bh->b_this_page) != head); - - if (uptodate && bh == head) - SetPageUptodate(page); - - if (count) { - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) - done = 1; - } - xfs_start_page_writeback(page, !page_dirty, count); - - return done; - fail_unlock_page: - unlock_page(page); - fail: - return 1; -} - -/* - * Convert & write out a cluster of pages in the same extent as defined - * by mp and following the start page. - */ -STATIC void -xfs_cluster_write( - struct inode *inode, - pgoff_t tindex, - struct xfs_bmbt_irec *imap, - xfs_ioend_t **ioendp, - struct writeback_control *wbc, - pgoff_t tlast) -{ - struct pagevec pvec; - int done = 0, i; - - pagevec_init(&pvec, 0); - while (!done && tindex <= tlast) { - unsigned len = min_t(pgoff_t, PAGEVEC_SIZE, tlast - tindex + 1); - - if (!pagevec_lookup(&pvec, inode->i_mapping, tindex, len)) - break; - - for (i = 0; i < pagevec_count(&pvec); i++) { - done = xfs_convert_page(inode, pvec.pages[i], tindex++, - imap, ioendp, wbc); - if (done) - break; - } - - pagevec_release(&pvec); - cond_resched(); - } -} - STATIC void xfs_vm_invalidatepage( struct page *page, @@ -896,20 +758,20 @@ out_invalidate: * redirty the page. */ STATIC int -xfs_vm_writepage( +__xfs_vm_writepage( struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc, + void *data) { + struct xfs_writeback_ctx *ctx = data; struct inode *inode = page->mapping->host; struct buffer_head *bh, *head; - struct xfs_bmbt_irec imap; - xfs_ioend_t *ioend = NULL, *iohead = NULL; loff_t offset; unsigned int type; __uint64_t end_offset; pgoff_t end_index, last_index; ssize_t len; - int err, imap_valid = 0, uptodate = 1; + int err, uptodate = 1; int count = 0; trace_xfs_writepage(inode, page, 0); @@ -917,20 +779,6 @@ xfs_vm_writepage( ASSERT(page_has_buffers(page)); /* - * Refuse to write the page out if we are called from reclaim context. - * - * This avoids stack overflows when called from deeply used stacks in - * random callers for direct reclaim or memcg reclaim. We explicitly - * allow reclaim from kswapd as the stack usage there is relatively low. - * - * This should really be done by the core VM, but until that happens - * filesystems like XFS, btrfs and ext4 have to take care of this - * by themselves. - */ - if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) - goto redirty; - - /* * Given that we do not allow direct reclaim to call us we should * never be called while in a filesystem transaction. */ @@ -973,36 +821,38 @@ xfs_vm_writepage( * buffers covering holes here. */ if (!buffer_mapped(bh) && buffer_uptodate(bh)) { - imap_valid = 0; + ctx->imap_valid = 0; continue; } if (buffer_unwritten(bh)) { if (type != IO_UNWRITTEN) { type = IO_UNWRITTEN; - imap_valid = 0; + ctx->imap_valid = 0; } } else if (buffer_delay(bh)) { if (type != IO_DELALLOC) { type = IO_DELALLOC; - imap_valid = 0; + ctx->imap_valid = 0; } } else if (buffer_uptodate(bh)) { if (type != IO_OVERWRITE) { type = IO_OVERWRITE; - imap_valid = 0; + ctx->imap_valid = 0; } } else { if (PageUptodate(page)) { ASSERT(buffer_mapped(bh)); - imap_valid = 0; + ctx->imap_valid = 0; } continue; } - if (imap_valid) - imap_valid = xfs_imap_valid(inode, &imap, offset); - if (!imap_valid) { + if (ctx->imap_valid) { + ctx->imap_valid = + xfs_imap_valid(inode, &ctx->imap, offset); + } + if (!ctx->imap_valid) { /* * If we didn't have a valid mapping then we need to * put the new mapping into a separate ioend structure. @@ -1012,22 +862,25 @@ xfs_vm_writepage( * time. */ new_ioend = 1; - err = xfs_map_blocks(inode, offset, &imap, type); + err = xfs_map_blocks(inode, offset, &ctx->imap, type); if (err) goto error; - imap_valid = xfs_imap_valid(inode, &imap, offset); + ctx->imap_valid = + xfs_imap_valid(inode, &ctx->imap, offset); } - if (imap_valid) { + if (ctx->imap_valid) { lock_buffer(bh); - if (type != IO_OVERWRITE) - xfs_map_at_offset(inode, bh, &imap, offset); - xfs_add_to_ioend(inode, bh, offset, type, &ioend, + if (type != IO_OVERWRITE) { + xfs_map_at_offset(inode, bh, &ctx->imap, + offset); + } + xfs_add_to_ioend(inode, bh, offset, type, &ctx->ioend, new_ioend); count++; } - if (!iohead) - iohead = ioend; + if (!ctx->iohead) + ctx->iohead = ctx->ioend; } while (offset += len, ((bh = bh->b_this_page) != head)); @@ -1035,38 +888,9 @@ xfs_vm_writepage( SetPageUptodate(page); xfs_start_page_writeback(page, 1, count); - - if (ioend && imap_valid) { - xfs_off_t end_index; - - end_index = imap.br_startoff + imap.br_blockcount; - - /* to bytes */ - end_index <<= inode->i_blkbits; - - /* to pages */ - end_index = (end_index - 1) >> PAGE_CACHE_SHIFT; - - /* check against file size */ - if (end_index > last_index) - end_index = last_index; - - xfs_cluster_write(inode, page->index + 1, &imap, &ioend, - wbc, end_index); - } - - if (iohead) - xfs_submit_ioend(wbc, iohead); - return 0; error: - if (iohead) - xfs_cancel_ioend(iohead); - - if (err == -EAGAIN) - goto redirty; - xfs_aops_discard_page(page); ClearPageUptodate(page); unlock_page(page); @@ -1079,12 +903,62 @@ redirty: } STATIC int +xfs_vm_writepage( + struct page *page, + struct writeback_control *wbc) +{ + struct xfs_writeback_ctx ctx = { }; + int ret; + + /* + * Refuse to write the page out if we are called from reclaim context. + * + * This avoids stack overflows when called from deeply used stacks in + * random callers for direct reclaim or memcg reclaim. We explicitly + * allow reclaim from kswapd as the stack usage there is relatively low. + * + * This should really be done by the core VM, but until that happens + * filesystems like XFS, btrfs and ext4 have to take care of this + * by themselves. + */ + if ((current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + + ret = __xfs_vm_writepage(page, wbc, &ctx); + + if (ctx.iohead) { + if (ret) + xfs_cancel_ioend(ctx.iohead); + else + xfs_submit_ioend(wbc, ctx.iohead); + } + + return ret; +} + +STATIC int xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { + struct xfs_writeback_ctx ctx = { }; + int ret; + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); - return generic_writepages(mapping, wbc); + + ret = write_cache_pages(mapping, wbc, __xfs_vm_writepage, &ctx); + + if (ctx.iohead) { + if (ret) + xfs_cancel_ioend(ctx.iohead); + else + xfs_submit_ioend(wbc, ctx.iohead); + } + + return ret; } /* _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs