Convert XFS to use the new iomap based multipage write path. This involves implementing the ->iomap_begin and ->iomap_end methods, and switching the buffered file write, page_mkwrite and xfs_iozero paths to the new iomap helpers. With this change __xfs_get_blocks will never be used for buffered writes, and the code handling them can be removed. Based on earlier code from Dave Chinner. Signed-off-by: Christoph Hellwig <hch@xxxxxx> Reviewed-by: Bob Peterson <rpeterso@xxxxxxxxxx> --- fs/xfs/Kconfig | 1 + fs/xfs/xfs_aops.c | 212 ----------------------------------------------------- fs/xfs/xfs_file.c | 71 ++++++++---------- fs/xfs/xfs_iomap.c | 144 ++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_iomap.h | 5 +- fs/xfs/xfs_iops.c | 9 ++- fs/xfs/xfs_trace.h | 3 + 7 files changed, 187 insertions(+), 258 deletions(-) diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 5d47b4d..35faf12 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -4,6 +4,7 @@ config XFS_FS depends on (64BIT || LBDAF) select EXPORTFS select LIBCRC32C + select FS_IOMAP help XFS is a high performance journaling filesystem which originated on the SGI IRIX platform. It is completely multi-threaded, can diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4c463b9..2ac9f7e 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1427,216 +1427,6 @@ xfs_vm_direct_IO( xfs_get_blocks_direct, endio, NULL, flags); } -/* - * Punch out the delalloc blocks we have already allocated. - * - * Don't bother with xfs_setattr given that nothing can have made it to disk yet - * as the page is still locked at this point. - */ -STATIC void -xfs_vm_kill_delalloc_range( - struct inode *inode, - loff_t start, - loff_t end) -{ - struct xfs_inode *ip = XFS_I(inode); - xfs_fileoff_t start_fsb; - xfs_fileoff_t end_fsb; - int error; - - start_fsb = XFS_B_TO_FSB(ip->i_mount, start); - end_fsb = XFS_B_TO_FSB(ip->i_mount, end); - if (end_fsb <= start_fsb) - return; - - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmap_punch_delalloc_range(ip, start_fsb, - end_fsb - start_fsb); - if (error) { - /* something screwed, just bail */ - if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_alert(ip->i_mount, - "xfs_vm_write_failed: unable to clean up ino %lld", - ip->i_ino); - } - } - xfs_iunlock(ip, XFS_ILOCK_EXCL); -} - -STATIC void -xfs_vm_write_failed( - struct inode *inode, - struct page *page, - loff_t pos, - unsigned len) -{ - loff_t block_offset; - loff_t block_start; - loff_t block_end; - loff_t from = pos & (PAGE_SIZE - 1); - loff_t to = from + len; - struct buffer_head *bh, *head; - struct xfs_mount *mp = XFS_I(inode)->i_mount; - - /* - * The request pos offset might be 32 or 64 bit, this is all fine - * on 64-bit platform. However, for 64-bit pos request on 32-bit - * platform, the high 32-bit will be masked off if we evaluate the - * block_offset via (pos & PAGE_MASK) because the PAGE_MASK is - * 0xfffff000 as an unsigned long, hence the result is incorrect - * which could cause the following ASSERT failed in most cases. - * In order to avoid this, we can evaluate the block_offset of the - * start of the page by using shifts rather than masks the mismatch - * problem. - */ - block_offset = (pos >> PAGE_SHIFT) << PAGE_SHIFT; - - ASSERT(block_offset + from == pos); - - head = page_buffers(page); - block_start = 0; - for (bh = head; bh != head || !block_start; - bh = bh->b_this_page, block_start = block_end, - block_offset += bh->b_size) { - block_end = block_start + bh->b_size; - - /* skip buffers before the write */ - if (block_end <= from) - continue; - - /* if the buffer is after the write, we're done */ - if (block_start >= to) - break; - - /* - * Process delalloc and unwritten buffers beyond EOF. We can - * encounter unwritten buffers in the event that a file has - * post-EOF unwritten extents and an extending write happens to - * fail (e.g., an unaligned write that also involves a delalloc - * to the same page). - */ - if (!buffer_delay(bh) && !buffer_unwritten(bh)) - continue; - - if (!xfs_mp_fail_writes(mp) && !buffer_new(bh) && - block_offset < i_size_read(inode)) - continue; - - if (buffer_delay(bh)) - xfs_vm_kill_delalloc_range(inode, block_offset, - block_offset + bh->b_size); - - /* - * This buffer does not contain data anymore. make sure anyone - * who finds it knows that for certain. - */ - clear_buffer_delay(bh); - clear_buffer_uptodate(bh); - clear_buffer_mapped(bh); - clear_buffer_new(bh); - clear_buffer_dirty(bh); - clear_buffer_unwritten(bh); - } - -} - -/* - * This used to call block_write_begin(), but it unlocks and releases the page - * on error, and we need that page to be able to punch stale delalloc blocks out - * on failure. hence we copy-n-waste it here and call xfs_vm_write_failed() at - * the appropriate point. - */ -STATIC int -xfs_vm_write_begin( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned flags, - struct page **pagep, - void **fsdata) -{ - pgoff_t index = pos >> PAGE_SHIFT; - struct page *page; - int status; - struct xfs_mount *mp = XFS_I(mapping->host)->i_mount; - - ASSERT(len <= PAGE_SIZE); - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - - status = __block_write_begin(page, pos, len, xfs_get_blocks); - if (xfs_mp_fail_writes(mp)) - status = -EIO; - if (unlikely(status)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - - xfs_vm_write_failed(inode, page, pos, len); - unlock_page(page); - - /* - * If the write is beyond EOF, we only want to kill blocks - * allocated in this write, not blocks that were previously - * written successfully. - */ - if (xfs_mp_fail_writes(mp)) - isize = 0; - if (pos + len > isize) { - ssize_t start = max_t(ssize_t, pos, isize); - - truncate_pagecache_range(inode, start, pos + len); - } - - put_page(page); - page = NULL; - } - - *pagep = page; - return status; -} - -/* - * On failure, we only need to kill delalloc blocks beyond EOF in the range of - * this specific write because they will never be written. Previous writes - * beyond EOF where block allocation succeeded do not need to be trashed, so - * only new blocks from this write should be trashed. For blocks within - * EOF, generic_write_end() zeros them so they are safe to leave alone and be - * written with all the other valid data. - */ -STATIC int -xfs_vm_write_end( - struct file *file, - struct address_space *mapping, - loff_t pos, - unsigned len, - unsigned copied, - struct page *page, - void *fsdata) -{ - int ret; - - ASSERT(len <= PAGE_SIZE); - - ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (unlikely(ret < len)) { - struct inode *inode = mapping->host; - size_t isize = i_size_read(inode); - loff_t to = pos + len; - - if (to > isize) { - /* only kill blocks in this write beyond EOF */ - if (pos > isize) - isize = pos; - xfs_vm_kill_delalloc_range(inode, isize, to); - truncate_pagecache_range(inode, isize, to); - } - } - return ret; -} - STATIC sector_t xfs_vm_bmap( struct address_space *mapping, @@ -1747,8 +1537,6 @@ const struct address_space_operations xfs_address_space_operations = { .set_page_dirty = xfs_vm_set_page_dirty, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, - .write_begin = xfs_vm_write_begin, - .write_end = xfs_vm_write_end, .bmap = xfs_vm_bmap, .direct_IO = xfs_vm_direct_IO, .migratepage = buffer_migrate_page, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 47fc632..7316d38 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -37,6 +37,7 @@ #include "xfs_log.h" #include "xfs_icache.h" #include "xfs_pnfs.h" +#include "xfs_iomap.h" #include <linux/dcache.h> #include <linux/falloc.h> @@ -79,57 +80,27 @@ xfs_rw_ilock_demote( inode_unlock(VFS_I(ip)); } -/* - * xfs_iozero clears the specified range supplied via the page cache (except in - * the DAX case). Writes through the page cache will allocate blocks over holes, - * though the callers usually map the holes first and avoid them. If a block is - * not completely zeroed, then it will be read from disk before being partially - * zeroed. - * - * In the DAX case, we can just directly write to the underlying pages. This - * will not allocate blocks, but will avoid holes and unwritten extents and so - * not do unnecessary work. - */ -int -xfs_iozero( - struct xfs_inode *ip, /* inode */ - loff_t pos, /* offset in file */ - size_t count) /* size of data to zero */ +static int +xfs_dax_zero_range( + struct inode *inode, + loff_t pos, + size_t count) { - struct page *page; - struct address_space *mapping; int status = 0; - - mapping = VFS_I(ip)->i_mapping; do { unsigned offset, bytes; - void *fsdata; offset = (pos & (PAGE_SIZE -1)); /* Within page */ bytes = PAGE_SIZE - offset; if (bytes > count) bytes = count; - if (IS_DAX(VFS_I(ip))) { - status = dax_zero_page_range(VFS_I(ip), pos, bytes, - xfs_get_blocks_direct); - if (status) - break; - } else { - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; - - zero_user(page, offset, bytes); + status = dax_zero_page_range(inode, pos, bytes, + xfs_get_blocks_direct); + if (status) + break; - status = pagecache_write_end(NULL, mapping, pos, bytes, - bytes, page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ - status = 0; - } pos += bytes; count -= bytes; } while (count); @@ -137,6 +108,24 @@ xfs_iozero( return status; } +/* + * Clear the specified ranges to zero through either the pagecache or DAX. + * Holes and unwritten extents will be left as-is as they already are zeroed. + */ +int +xfs_iozero( + struct xfs_inode *ip, + loff_t pos, + size_t count) +{ + struct inode *inode = VFS_I(ip); + + if (IS_DAX(VFS_I(ip))) + return xfs_dax_zero_range(inode, pos, count); + else + return iomap_zero_range(inode, pos, count, NULL, &xfs_iomap_ops); +} + int xfs_update_prealloc_flags( struct xfs_inode *ip, @@ -841,7 +830,7 @@ xfs_file_buffered_aio_write( write_retry: trace_xfs_file_buffered_write(ip, iov_iter_count(from), iocb->ki_pos, 0); - ret = generic_perform_write(file, from, iocb->ki_pos); + ret = iomap_file_buffered_write(iocb, from, &xfs_iomap_ops); if (likely(ret >= 0)) iocb->ki_pos += ret; @@ -1553,7 +1542,7 @@ xfs_filemap_page_mkwrite( if (IS_DAX(inode)) { ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault); } else { - ret = block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); } diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 2f37194..620fc91 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -967,3 +967,147 @@ xfs_bmbt_to_iomap( iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount); iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip)); } + +static inline bool imap_needs_alloc(struct xfs_bmbt_irec *imap, int nimaps) +{ + return !nimaps || + imap->br_startblock == HOLESTARTBLOCK || + imap->br_startblock == DELAYSTARTBLOCK; +} + +static int +xfs_file_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + struct xfs_bmbt_irec imap; + xfs_fileoff_t offset_fsb, end_fsb; + int nimaps = 1, error = 0; + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + ASSERT(offset <= mp->m_super->s_maxbytes); + if ((xfs_fsize_t)offset + length > mp->m_super->s_maxbytes) + length = mp->m_super->s_maxbytes - offset; + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, + &nimaps, XFS_BMAPI_ENTIRE); + if (error) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; + } + + if ((flags & IOMAP_WRITE) && imap_needs_alloc(&imap, nimaps)) { + /* + * We cap the maximum length we map here to MAX_WRITEBACK_PAGES + * pages to keep the chunks of work done where somewhat symmetric + * with the work writeback does. This is a completely arbitrary + * number pulled out of thin air as a best guess for initial + * testing. + * + * Note that the values needs to be less than 32-bits wide until + * the lower level functions are updated. + */ + length = min_t(loff_t, length, 1024 * PAGE_SIZE); + if (xfs_get_extsz_hint(ip)) { + /* + * xfs_iomap_write_direct() expects the shared lock. It + * is unlocked on return. + */ + xfs_ilock_demote(ip, XFS_ILOCK_EXCL); + error = xfs_iomap_write_direct(ip, offset, length, &imap, + nimaps); + } else { + error = xfs_iomap_write_delay(ip, offset, length, &imap); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + } + + if (error) + return error; + + trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); + xfs_bmbt_to_iomap(ip, iomap, &imap); + } else if (nimaps) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_found(ip, offset, length, 0, &imap); + xfs_bmbt_to_iomap(ip, iomap, &imap); + } else { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + trace_xfs_iomap_not_found(ip, offset, length, 0, &imap); + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->type = IOMAP_HOLE; + iomap->offset = offset; + iomap->length = length; + } + + return 0; +} + +static int +xfs_file_iomap_end_delalloc( + struct xfs_inode *ip, + loff_t offset, + loff_t length, + ssize_t written) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t start_fsb; + xfs_fileoff_t end_fsb; + int error = 0; + + start_fsb = XFS_B_TO_FSB(mp, offset + written); + end_fsb = XFS_B_TO_FSB(mp, offset + length); + + /* + * Trim back delalloc blocks if we didn't manage to write the whole + * range reserved. + * + * We don't need to care about racing delalloc as we hold i_mutex + * across the reserve/allocate/unreserve calls. If there are delalloc + * blocks in the range, they are ours. + */ + if (start_fsb < end_fsb) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_bmap_punch_delalloc_range(ip, start_fsb, + end_fsb - start_fsb); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + if (error && !XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert(mp, "%s: unable to clean up ino %lld", + __func__, ip->i_ino); + return error; + } + } + + return 0; +} + +static int +xfs_file_iomap_end( + struct inode *inode, + loff_t offset, + loff_t length, + ssize_t written, + unsigned flags, + struct iomap *iomap) +{ + if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) + return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, + length, written); + return 0; +} + +struct iomap_ops xfs_iomap_ops = { + .iomap_begin = xfs_file_iomap_begin, + .iomap_end = xfs_file_iomap_end, +}; diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 718f07c..e066d04 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -18,7 +18,8 @@ #ifndef __XFS_IOMAP_H__ #define __XFS_IOMAP_H__ -struct iomap; +#include <linux/iomap.h> + struct xfs_inode; struct xfs_bmbt_irec; @@ -33,4 +34,6 @@ int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t); void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *, struct xfs_bmbt_irec *); +extern struct iomap_ops xfs_iomap_ops; + #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 1a5ca4b..5d1fdae 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -38,6 +38,7 @@ #include "xfs_dir2.h" #include "xfs_trans_space.h" #include "xfs_pnfs.h" +#include "xfs_iomap.h" #include <linux/capability.h> #include <linux/xattr.h> @@ -822,8 +823,8 @@ xfs_setattr_size( error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); } else { - error = block_truncate_page(inode->i_mapping, newsize, - xfs_get_blocks); + error = iomap_truncate_page(inode, newsize, + &did_zeroing, &xfs_iomap_ops); } } @@ -838,8 +839,8 @@ xfs_setattr_size( * problem. Note that this includes any block zeroing we did above; * otherwise those blocks may not be zeroed after a crash. */ - if (newsize > ip->i_d.di_size && - (oldsize != ip->i_d.di_size || did_zeroing)) { + if (did_zeroing || + (newsize > ip->i_d.di_size && oldsize != ip->i_d.di_size)) { error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ip->i_d.di_size, newsize); if (error) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index ea94ee0..bb24ce7 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1295,6 +1295,9 @@ DEFINE_IOMAP_EVENT(xfs_map_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_found); DEFINE_IOMAP_EVENT(xfs_get_blocks_alloc); DEFINE_IOMAP_EVENT(xfs_get_blocks_map_direct); +DEFINE_IOMAP_EVENT(xfs_iomap_alloc); +DEFINE_IOMAP_EVENT(xfs_iomap_found); +DEFINE_IOMAP_EVENT(xfs_iomap_not_found); DECLARE_EVENT_CLASS(xfs_simple_io_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), -- 2.1.4 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html