This is a note to let you know that I've just added the patch titled

    ext4: fix races between page faults and hole punching

to the 4.4-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     ext4-fix-races-between-page-faults-and-hole-punching.patch
and it can be found in the queue-4.4 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.


>From ea3d7209ca01da209cda6f0dea8be9cc4b7a933b Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@xxxxxxxx>
Date: Mon, 7 Dec 2015 14:28:03 -0500
Subject: ext4: fix races between page faults and hole punching

From: Jan Kara <jack@xxxxxxxx>

commit ea3d7209ca01da209cda6f0dea8be9cc4b7a933b upstream.

Currently, page faults and hole punching are completely unsynchronized.
This can result in a page fault faulting in a page in a range that we
are punching after truncate_pagecache_range() has been called, and thus
we can end up with a page mapped to disk blocks that will shortly be
freed. Filesystem corruption will shortly follow. Note that the same
race is avoided for truncate by checking the page fault offset against
i_size, but there isn't a similar mechanism available for punching holes.

Fix the problem by creating a new rw semaphore, i_mmap_sem, in the inode
and grabbing it for writing over truncate, hole punching, and other
functions removing blocks from the extent tree, and for reading over
page faults. We cannot easily use i_data_sem for this since that ranks
below transaction start and we need something ranking above it so that
it can be held over the whole truncate / hole punching operation. Also
remove various workarounds we had in the code to reduce the race window
when a page fault could have created pages with stale mapping
information.

Signed-off-by: Jan Kara <jack@xxxxxxxx>
Signed-off-by: Theodore Ts'o <tytso@xxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>

---
 fs/ext4/ext4.h     |   10 ++++++++
 fs/ext4/extents.c  |   54 ++++++++++++++++++++++++-------------------
 fs/ext4/file.c     |   66 +++++++++++++++++++++++++++++++++++++++++++++--------
 fs/ext4/inode.c    |   36 +++++++++++++++++++++-------
 fs/ext4/super.c    |    1 
 fs/ext4/truncate.h |    2 +
 6 files changed, 127 insertions(+), 42 deletions(-)

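As a rough, self-contained illustration of the locking scheme the changelog
describes (this sketch is not part of the patch, and every identifier in it
is invented for the example), the same reader/writer pattern can be modeled
in user space with a pthread rwlock: the fault path may only instantiate a
page while holding the lock for read, and the hole-punch path holds it for
write across both dropping cached pages and freeing the underlying blocks,
so a fault can never map a page to blocks that are about to be freed.

/* minimal demo only -- names are invented, this is not kernel code */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

#define NPAGES 8

static pthread_rwlock_t demo_mmap_sem = PTHREAD_RWLOCK_INITIALIZER;
static bool page_cached[NPAGES];      /* stand-in for the page cache */
static bool block_allocated[NPAGES];  /* stand-in for on-disk blocks */

/* "page fault": may only map a page while holding the lock for read */
static void demo_fault_in(int pgoff)
{
        pthread_rwlock_rdlock(&demo_mmap_sem);
        if (block_allocated[pgoff])
                page_cached[pgoff] = true;
        pthread_rwlock_unlock(&demo_mmap_sem);
}

/*
 * "hole punch": holds the lock for write over both steps, so no fault
 * can slip in between dropping the pages and freeing the blocks.
 */
static void demo_punch_hole(int first, int last)
{
        pthread_rwlock_wrlock(&demo_mmap_sem);
        for (int i = first; i <= last; i++) {
                page_cached[i] = false;       /* drop cached pages */
                block_allocated[i] = false;   /* free the blocks */
        }
        pthread_rwlock_unlock(&demo_mmap_sem);
}

int main(void)
{
        for (int i = 0; i < NPAGES; i++)
                block_allocated[i] = true;

        demo_fault_in(2);
        demo_punch_hole(1, 3);
        demo_fault_in(2);       /* sees the hole; nothing gets mapped */

        printf("page 2 cached=%d, block 2 allocated=%d\n",
               page_cached[2], block_allocated[2]);
        return 0;
}

In the patch below, down_read()/up_read() on i_mmap_sem play the role of the
read lock in the fault handlers, while the truncate, hole punching, and
range-manipulation paths take the semaphore with down_write().
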
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -933,6 +933,15 @@ struct ext4_inode_info {
         * by other means, so we have i_data_sem.
         */
        struct rw_semaphore i_data_sem;
+       /*
+        * i_mmap_sem is for serializing page faults with truncate / punch hole
+        * operations. We have to make sure that new page cannot be faulted in
+        * a section of the inode that is being punched. We cannot easily use
+        * i_data_sem for this since we need protection for the whole punch
+        * operation and i_data_sem ranks below transaction start so we have
+        * to occasionally drop it.
+        */
+       struct rw_semaphore i_mmap_sem;
        struct inode vfs_inode;
        struct jbd2_inode *jinode;
 
@@ -2507,6 +2516,7 @@ extern int ext4_chunk_trans_blocks(struc
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
                             loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
                                        int used, int quota_claim);
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4770,7 +4770,6 @@ static long ext4_zero_range(struct file
        int partial_begin, partial_end;
        loff_t start, end;
        ext4_lblk_t lblk;
-       struct address_space *mapping = inode->i_mapping;
        unsigned int blkbits = inode->i_blkbits;
 
        trace_ext4_zero_range(inode, offset, len, mode);
@@ -4786,17 +4785,6 @@ static long ext4_zero_range(struct file
        }
 
        /*
-        * Write out all dirty pages to avoid race conditions
-        * Then release them.
-        */
-       if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-               ret = filemap_write_and_wait_range(mapping, offset,
-                                                  offset + len - 1);
-               if (ret)
-                       return ret;
-       }
-
-       /*
         * Round up offset. This is not fallocate, we neet to zero out
         * blocks, so convert interior block aligned part of the range to
         * unwritten and possibly manually zero out unaligned parts of the
@@ -4856,16 +4844,22 @@ static long ext4_zero_range(struct file
                flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
                          EXT4_EX_NOCACHE);
 
-               /* Now release the pages and zero block aligned part of pages*/
-               truncate_pagecache_range(inode, start, end - 1);
-               inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
-
                /* Wait all existing dio workers, newcomers will block on i_mutex */
                ext4_inode_block_unlocked_dio(inode);
                inode_dio_wait(inode);
 
+               /*
+                * Prevent page faults from reinstantiating pages we have
+                * released from page cache.
+                */
+               down_write(&EXT4_I(inode)->i_mmap_sem);
+               /* Now release the pages and zero block aligned part of pages */
+               truncate_pagecache_range(inode, start, end - 1);
+               inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
+
                ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
                                             flags, mode);
+               up_write(&EXT4_I(inode)->i_mmap_sem);
                if (ret)
                        goto out_dio;
        }
@@ -5524,17 +5518,22 @@ int ext4_collapse_range(struct inode *in
                goto out_mutex;
        }
 
-       truncate_pagecache(inode, ioffset);
-
        /* Wait for existing dio to complete */
        ext4_inode_block_unlocked_dio(inode);
        inode_dio_wait(inode);
 
+       /*
+        * Prevent page faults from reinstantiating pages we have released from
+        * page cache.
+        */
+       down_write(&EXT4_I(inode)->i_mmap_sem);
+       truncate_pagecache(inode, ioffset);
+
        credits = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
-               goto out_dio;
+               goto out_mmap;
        }
 
        down_write(&EXT4_I(inode)->i_data_sem);
@@ -5573,7 +5572,8 @@ int ext4_collapse_range(struct inode *in
 
 out_stop:
        ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+       up_write(&EXT4_I(inode)->i_mmap_sem);
        ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
        mutex_unlock(&inode->i_mutex);
@@ -5660,17 +5660,22 @@ int ext4_insert_range(struct inode *inod
                goto out_mutex;
        }
 
-       truncate_pagecache(inode, ioffset);
-
        /* Wait for existing dio to complete */
        ext4_inode_block_unlocked_dio(inode);
        inode_dio_wait(inode);
 
+       /*
+        * Prevent page faults from reinstantiating pages we have released from
+        * page cache.
+        */
+       down_write(&EXT4_I(inode)->i_mmap_sem);
+       truncate_pagecache(inode, ioffset);
+
        credits = ext4_writepage_trans_blocks(inode);
        handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
        if (IS_ERR(handle)) {
                ret = PTR_ERR(handle);
-               goto out_dio;
+               goto out_mmap;
        }
 
        /* Expand file to avoid data loss if there is error while shifting */
@@ -5741,7 +5746,8 @@ int ext4_insert_range(struct inode *inod
 
 out_stop:
        ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+       up_write(&EXT4_I(inode)->i_mmap_sem);
        ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
        mutex_unlock(&inode->i_mutex);
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -209,15 +209,18 @@ static int ext4_dax_fault(struct vm_area
 {
        int result;
        handle_t *handle = NULL;
-       struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+       struct inode *inode = file_inode(vma->vm_file);
+       struct super_block *sb = inode->i_sb;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
 
        if (write) {
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
+               down_read(&EXT4_I(inode)->i_mmap_sem);
                handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
                                                EXT4_DATA_TRANS_BLOCKS(sb));
-       }
+       } else
+               down_read(&EXT4_I(inode)->i_mmap_sem);
 
        if (IS_ERR(handle))
                result = VM_FAULT_SIGBUS;
@@ -228,8 +231,10 @@ static int ext4_dax_fault(struct vm_area
        if (write) {
                if (!IS_ERR(handle))
                        ext4_journal_stop(handle);
+               up_read(&EXT4_I(inode)->i_mmap_sem);
                sb_end_pagefault(sb);
-       }
+       } else
+               up_read(&EXT4_I(inode)->i_mmap_sem);
 
        return result;
 }
@@ -246,10 +251,12 @@ static int ext4_dax_pmd_fault(struct vm_
        if (write) {
                sb_start_pagefault(sb);
                file_update_time(vma->vm_file);
+               down_read(&EXT4_I(inode)->i_mmap_sem);
                handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
                                ext4_chunk_trans_blocks(inode,
                                                        PMD_SIZE / PAGE_SIZE));
-       }
+       } else
+               down_read(&EXT4_I(inode)->i_mmap_sem);
 
        if (IS_ERR(handle))
                result = VM_FAULT_SIGBUS;
@@ -260,30 +267,71 @@ static int ext4_dax_pmd_fault(struct vm_
        if (write) {
                if (!IS_ERR(handle))
                        ext4_journal_stop(handle);
+               up_read(&EXT4_I(inode)->i_mmap_sem);
                sb_end_pagefault(sb);
-       }
+       } else
+               up_read(&EXT4_I(inode)->i_mmap_sem);
 
        return result;
 }
 
 static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-       return dax_mkwrite(vma, vmf, ext4_get_block_dax,
-                               ext4_end_io_unwritten);
+       int err;
+       struct inode *inode = file_inode(vma->vm_file);
+
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+       down_read(&EXT4_I(inode)->i_mmap_sem);
+       err = __dax_mkwrite(vma, vmf, ext4_get_block_dax,
+                           ext4_end_io_unwritten);
+       up_read(&EXT4_I(inode)->i_mmap_sem);
+       sb_end_pagefault(inode->i_sb);
+
+       return err;
+}
+
+/*
+ * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite()
+ * handler we check for races agaist truncate. Note that since we cycle through
+ * i_mmap_sem, we are sure that also any hole punching that began before we
+ * were called is finished by now and so if it included part of the file we
+ * are working on, our pte will get unmapped and the check for pte_same() in
+ * wp_pfn_shared() fails. Thus fault gets retried and things work out as
+ * desired.
+ */
+static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
+                               struct vm_fault *vmf)
+{
+       struct inode *inode = file_inode(vma->vm_file);
+       struct super_block *sb = inode->i_sb;
+       int ret = VM_FAULT_NOPAGE;
+       loff_t size;
+
+       sb_start_pagefault(sb);
+       file_update_time(vma->vm_file);
+       down_read(&EXT4_I(inode)->i_mmap_sem);
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (vmf->pgoff >= size)
+               ret = VM_FAULT_SIGBUS;
+       up_read(&EXT4_I(inode)->i_mmap_sem);
+       sb_end_pagefault(sb);
+
+       return ret;
 }
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
        .fault          = ext4_dax_fault,
        .pmd_fault      = ext4_dax_pmd_fault,
        .page_mkwrite   = ext4_dax_mkwrite,
-       .pfn_mkwrite    = dax_pfn_mkwrite,
+       .pfn_mkwrite    = ext4_dax_pfn_mkwrite,
 };
 #else
 #define ext4_dax_vm_ops        ext4_file_vm_ops
 #endif
 
 static const struct vm_operations_struct ext4_file_vm_ops = {
-       .fault          = filemap_fault,
+       .fault          = ext4_filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = ext4_page_mkwrite,
 };
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3651,6 +3651,15 @@ int ext4_punch_hole(struct inode *inode,
 
        }
 
+       /* Wait all existing dio workers, newcomers will block on i_mutex */
+       ext4_inode_block_unlocked_dio(inode);
+       inode_dio_wait(inode);
+
+       /*
+        * Prevent page faults from reinstantiating pages we have released from
+        * page cache.
+        */
+       down_write(&EXT4_I(inode)->i_mmap_sem);
        first_block_offset = round_up(offset, sb->s_blocksize);
        last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
 
@@ -3659,10 +3668,6 @@ int ext4_punch_hole(struct inode *inode,
                truncate_pagecache_range(inode, first_block_offset,
                                         last_block_offset);
 
-       /* Wait all existing dio workers, newcomers will block on i_mutex */
-       ext4_inode_block_unlocked_dio(inode);
-       inode_dio_wait(inode);
-
        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                credits = ext4_writepage_trans_blocks(inode);
        else
@@ -3708,16 +3713,12 @@ int ext4_punch_hole(struct inode *inode,
        if (IS_SYNC(inode))
                ext4_handle_sync(handle);
 
-       /* Now release the pages again to reduce race window */
-       if (last_block_offset > first_block_offset)
-               truncate_pagecache_range(inode, first_block_offset,
-                                        last_block_offset);
-
        inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
        ext4_mark_inode_dirty(handle, inode);
 out_stop:
        ext4_journal_stop(handle);
 out_dio:
+       up_write(&EXT4_I(inode)->i_mmap_sem);
        ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
        mutex_unlock(&inode->i_mutex);
@@ -4851,6 +4852,7 @@ int ext4_setattr(struct dentry *dentry,
                        } else
                                ext4_wait_for_tail_page_commit(inode);
                }
+               down_write(&EXT4_I(inode)->i_mmap_sem);
                /*
                 * Truncate pagecache after we've waited for commit
                 * in data=journal mode to make pages freeable.
@@ -4858,6 +4860,7 @@ int ext4_setattr(struct dentry *dentry,
                truncate_pagecache(inode, inode->i_size);
                if (shrink)
                        ext4_truncate(inode);
+               up_write(&EXT4_I(inode)->i_mmap_sem);
        }
 
        if (!rc) {
@@ -5306,6 +5309,8 @@ int ext4_page_mkwrite(struct vm_area_str
 
        sb_start_pagefault(inode->i_sb);
        file_update_time(vma->vm_file);
+
+       down_read(&EXT4_I(inode)->i_mmap_sem);
        /* Delalloc case is easy... */
        if (test_opt(inode->i_sb, DELALLOC) &&
            !ext4_should_journal_data(inode) &&
@@ -5375,6 +5380,19 @@ retry_alloc:
 out_ret:
        ret = block_page_mkwrite_return(ret);
 out:
+       up_read(&EXT4_I(inode)->i_mmap_sem);
        sb_end_pagefault(inode->i_sb);
        return ret;
 }
+
+int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+       struct inode *inode = file_inode(vma->vm_file);
+       int err;
+
+       down_read(&EXT4_I(inode)->i_mmap_sem);
+       err = filemap_fault(vma, vmf);
+       up_read(&EXT4_I(inode)->i_mmap_sem);
+
+       return err;
+}
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -958,6 +958,7 @@ static void init_once(void *foo)
        INIT_LIST_HEAD(&ei->i_orphan);
        init_rwsem(&ei->xattr_sem);
        init_rwsem(&ei->i_data_sem);
+       init_rwsem(&ei->i_mmap_sem);
        inode_init_once(&ei->vfs_inode);
 }
 
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -10,8 +10,10 @@
  */
 static inline void ext4_truncate_failed_write(struct inode *inode)
 {
+       down_write(&EXT4_I(inode)->i_mmap_sem);
        truncate_inode_pages(inode->i_mapping, inode->i_size);
        ext4_truncate(inode);
+       up_write(&EXT4_I(inode)->i_mmap_sem);
 }
 
 /*


Patches currently in stable-queue which might be from jack@xxxxxxxx are

queue-4.4/ext4-fix-races-between-page-faults-and-hole-punching.patch
queue-4.4/ext4-move-unlocked-dio-protection-from-ext4_alloc_file_blocks.patch
queue-4.4/ext4-fix-races-between-buffered-io-and-collapse-insert-range.patch
queue-4.4/ext4-fix-races-of-writeback-with-punch-hole-and-zero-range.patch
--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html