On Thu, Sep 25, 2014 at 04:33:17PM -0400, Matthew Wilcox wrote: ... For those who want to see what changed between v10 and v11, here are the patches that I applied to my internal git tree.
>From e41949f26f9cc492aab17a1b94d030a11c020893 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox <willy@xxxxxxxxxxxxxxx> Date: Wed, 10 Sep 2014 13:19:22 -0400 Subject: [PATCH 1/7] dax: A couple of fixes from Dave Chinner If dax_clear_blocks() returns an error, segfault. Don't bother calling get_block() again if the BH is unwritten; the block is already allocated, and this won't help matters. Call b_end_io() if it's set, after zeroing the block, enabling the fs to convert the block from unwritten to written. Signed-off-by: Matthew Wilcox <willy@xxxxxxxxxxxxxxx> --- fs/dax.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index bdf6622..90418ca 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -327,7 +327,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (error) goto unlock_page; - if (!buffer_written(&bh) && !vmf->cow_page) { + if (!buffer_mapped(&bh) && !vmf->cow_page) { if (vmf->flags & FAULT_FLAG_WRITE) { error = get_block(inode, block, &bh, 1); count_vm_event(PGMAJFAULT); @@ -364,8 +364,13 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, return VM_FAULT_LOCKED; } - if (buffer_unwritten(&bh) || buffer_new(&bh)) - dax_clear_blocks(inode, bh.b_blocknr, bh.b_size); + if (buffer_unwritten(&bh) || buffer_new(&bh)) { + error = dax_clear_blocks(inode, bh.b_blocknr, bh.b_size); + if (error) + goto out; + if (bh.b_end_io) + bh.b_end_io(&bh, 1); + } /* Check we didn't race with a read fault installing a new page */ if (!page && major) -- 2.1.0
>From 9c9739b5942dca1e9238631c1bed48f1b21d8b63 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox <willy@xxxxxxxxxxxxxxx> Date: Thu, 11 Sep 2014 12:42:47 -0400 Subject: [PATCH 2/7] dax: Missing unlock in error path If the file was truncated, we have to drop the i_mmap_mutex before returning an error. Signed-off-by: Matthew Wilcox <willy@xxxxxxxxxxxxxxx> --- fs/dax.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/dax.c b/fs/dax.c index 90418ca..fabe9da 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -357,6 +357,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) { + mutex_unlock(&mapping->i_mmap_mutex); error = -EIO; goto out; } -- 2.1.0
>From ea8e4473e479bbf66a1caa956214b101b6845855 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox <willy@xxxxxxxxxxxxxxx> Date: Thu, 11 Sep 2014 12:44:20 -0400 Subject: [PATCH 3/7] dax: Must hold mutex while clearing blocks The i_mmap_mutex was not being held across the call to dax_clear_blocks(). That made it possible for a truncate racing with the page fault to have removed the blocks from the file before the call to dax_clear_blocks(). If the blocks had been reassigned to some other purpose, dax_clear_blocks() could end up clearing blocks that had somebody else's data in them. dax_do_fault() is getting a little long, so bundle up all this code into a new dax_insert_mapping() function. Call clear_page() instead of dax_clear_blocks(), since we know we're only clearing a single page. And use bdev_direct_access() instead of dax_get_pfn() since we actually want both the pfn (for inserting the map) and the address (for clearing the memory). Signed-off-by: Matthew Wilcox <willy@xxxxxxxxxxxxxxx> --- fs/dax.c | 87 ++++++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index fabe9da..b130b47 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -68,14 +68,6 @@ static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits) return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); } -static long dax_get_pfn(struct buffer_head *bh, unsigned long *pfn, - unsigned blkbits) -{ - void *addr; - sector_t sector = bh->b_blocknr << (blkbits - 9); - return bdev_direct_access(bh->b_bdev, sector, &addr, pfn, bh->b_size); -} - static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos, loff_t end) { @@ -283,6 +275,54 @@ static int copy_user_bh(struct page *to, struct buffer_head *bh, return 0; } +static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct address_space *mapping = 
inode->i_mapping; + sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); + unsigned long vaddr = (unsigned long)vmf->virtual_address; + void *addr; + unsigned long pfn; + pgoff_t size; + int error; + + mutex_lock(&mapping->i_mmap_mutex); + + /* + * Check truncate didn't happen while we were allocating a block. + * If it did, this block may or may not be still allocated to the + * file. We can't tell the filesystem to free it because we can't + * take i_mutex here. In the worst case, the file still has blocks + * allocated past the end of the file. + */ + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (unlikely(vmf->pgoff >= size)) { + error = -EIO; + goto out; + } + + error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); + if (error < 0) + goto out; + if (error < PAGE_SIZE) { + error = -EIO; + goto out; + } + + if (buffer_unwritten(bh) || buffer_new(bh)) { + clear_page(addr); + if (bh->b_end_io) + bh->b_end_io(bh, 1); + } + + error = vm_insert_mixed(vma, vaddr, pfn); + + out: + mutex_unlock(&mapping->i_mmap_mutex); + + return error; +} + static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, get_block_t get_block) { @@ -295,7 +335,6 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, unsigned blkbits = inode->i_blkbits; sector_t block; pgoff_t size; - unsigned long pfn; int error; int major = 0; @@ -365,14 +404,6 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, return VM_FAULT_LOCKED; } - if (buffer_unwritten(&bh) || buffer_new(&bh)) { - error = dax_clear_blocks(inode, bh.b_blocknr, bh.b_size); - if (error) - goto out; - if (bh.b_end_io) - bh.b_end_io(&bh, 1); - } - /* Check we didn't race with a read fault installing a new page */ if (!page && major) page = find_lock_page(mapping, vmf->pgoff); @@ -385,27 +416,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, page_cache_release(page); } - mutex_lock(&mapping->i_mmap_mutex); 
- - /* - * Check truncate didn't happen while we were allocating a block. - * If it did, this block may or may not be still allocated to the - * file. We can't tell the filesystem to free it because we can't - * take i_mutex here. In the worst case, the file still has blocks - * allocated past the end of the file. - */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (unlikely(vmf->pgoff >= size)) { - mutex_unlock(&mapping->i_mmap_mutex); - error = -EIO; - goto out; - } - - error = dax_get_pfn(&bh, &pfn, blkbits); - if (error > 0) - error = vm_insert_mixed(vma, vaddr, pfn); - - mutex_unlock(&mapping->i_mmap_mutex); + error = dax_insert_mapping(inode, &bh, vma, vmf); out: if (error == -ENOMEM) -- 2.1.0
>From 9daf54382b53f3cffc3f050d75edf43e3c51efb4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox <willy@xxxxxxxxxxxxxxx> Date: Wed, 24 Sep 2014 13:53:24 -0400 Subject: [PATCH 4/7] dax: Unwritten extents don't set the mapped flag Despite an unwritten extent having a defined mapping, buffer_mapped() returns false. We don't need to call get_block() again here, since we know what the disk block is that corresponds to this file offset. --- fs/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index b130b47..59be664 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -366,7 +366,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (error) goto unlock_page; - if (!buffer_mapped(&bh) && !vmf->cow_page) { + if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { if (vmf->flags & FAULT_FLAG_WRITE) { error = get_block(inode, block, &bh, 1); count_vm_event(PGMAJFAULT); -- 2.1.0
>From 96f051597cfd91fe51a30fc3dbdeed290b98d7fe Mon Sep 17 00:00:00 2001 From: Matthew Wilcox <willy@xxxxxxxxxxxxxxx> Date: Wed, 24 Sep 2014 14:02:38 -0400 Subject: [PATCH 5/7] ext4: Add a callback to convert unwritten extents A different bug was masking the problem that unwritten extents need to be converted to written extents once we've faulted them into existence. Following the XFS example, add a b_end_io callback. We "borrow" a few additional fields in the buffer_head, but there aren't any big enough for a sector_t. Fortunately, we only use this callback for DAX, and ext4 already requires a 4k block size for using DAX, which puts the limit at 16TB. The page cache already limits file sizes to 16TB on 32-bit systems, so we don't need to grow any fields. --- fs/ext4/inode.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5edd903..eaa293a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -676,6 +676,18 @@ has_zeroout: return retval; } +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) +{ + struct inode *inode = bh->b_assoc_map->host; + /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; + int err; + if (!uptodate) + return; + WARN_ON(!buffer_unwritten(bh)); + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); +} + /* Maximum number of blocks we map for direct IO at once. 
*/ #define DIO_MAX_BLOCKS 4096 @@ -713,6 +725,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { + bh->b_assoc_map = inode->i_mapping; + bh->b_private = (void *)(unsigned long)iblock; + bh->b_end_io = ext4_end_io_unwritten; + } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; -- 2.1.0
>From b73ccea0e0bb4f09fa4ad0a4fa20f6d346bedf50 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox <willy@xxxxxxxxxxxxxxx> Date: Wed, 24 Sep 2014 14:08:40 -0400 Subject: [PATCH 6/7] vfs: Prevent DAX I/Os from falling back to buffered I/O Unlike regular direct I/O, DAX will handle file holes, and there is no desire to fall back to buffered I/O. Buffered I/O ought to fail if DAX I/O fails, unless we're doing a random-failure test. So skip the buffered I/O attempts for DAX files. --- mm/filemap.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 19bdb68..e69b586 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1717,9 +1717,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * we've already read everything we wanted to, or if * there was a short read because we hit EOF, go ahead * and return. Otherwise fallthrough to buffered io for - * the rest of the read. + * the rest of the read. Buffered reads will not work for + * DAX files, so don't bother trying. */ - if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) { + if (retval < 0 || !iov_iter_count(iter) || *ppos >= size || + IS_DAX(inode)) { file_accessed(file); goto out; } @@ -2582,13 +2584,16 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; written = generic_file_direct_write(iocb, from, pos); - if (written < 0 || written == count) - goto out; - /* - * direct-io write to a hole: fall through to buffered I/O - * for completing the rest of the request. + * If the write stopped short of completing, fall back to + * buffered writes. Some filesystems do this for writes to + * holes, for example. For DAX files, a buffered write will + * not succeed (even if it did, DAX does not handle dirty + * page-cache pages correctly). */ + if (written < 0 || written == count || IS_DAX(inode)) + goto out; + pos += written; count -= written; -- 2.1.0
>From 953153c75704a3954052b68c63b4cc3ad15b7a06 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox <willy@xxxxxxxxxxxxxxx> Date: Wed, 24 Sep 2014 14:27:57 -0400 Subject: [PATCH 7/7] dax: Call b_end_io outside the i_mmap_mutex Lockdep helpfully points out that ext4_convert_unwritten_extents will start a jbd transaction, and i_mmap_mutex is already nested inside the jbd2_handle lock. So move the call to b_end_io to after we drop the i_mmap_mutex; this only extends the window in which we can crash and the allocation will be lost; it does not introduce any new races. --- fs/dax.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 59be664..91b7561 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -309,17 +309,17 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, goto out; } - if (buffer_unwritten(bh) || buffer_new(bh)) { + if (buffer_unwritten(bh) || buffer_new(bh)) clear_page(addr); - if (bh->b_end_io) - bh->b_end_io(bh, 1); - } error = vm_insert_mixed(vma, vaddr, pfn); out: mutex_unlock(&mapping->i_mmap_mutex); + if (bh->b_end_io) + bh->b_end_io(bh, 1); + return error; } -- 2.1.0