O_DIRECT flags can be toggeled via fcntl(F_SETFL). But this value checked twice inside ext4_file_write_iter() and __generic_file_write() which result in BUG_ON (see typical stack trace below) In order to fix this we have to use our own copy of __generic_file_write and pass o_direct status explicitly. TESTCASE: xfstest:generic/326 (http://patchwork.ozlabs.org/patch/397949/) #TYPICAL STACK TRACE: kernel BUG at fs/ext4/inode.c:2960! invalid opcode: 0000 [#1] SMP Modules linked in: brd iTCO_wdt lpc_ich mfd_core igb ptp dm_mirror dm_region_hash dm_log dm_mod CPU: 6 PID: 5505 Comm: aio-dio-fcntl-r Not tainted 3.17.0-rc2-00176-gff5c017 #161 Hardware name: Intel Corporation W2600CR/W2600CR, BIOS SE5C600.86B.99.99.x028.061320111235 06/13/2011 task: ffff88080e95a7c0 ti: ffff88080f908000 task.ti: ffff88080f908000 RIP: 0010:[<ffffffff811fabf2>] [<ffffffff811fabf2>] ext4_direct_IO+0x162/0x3d0 RSP: 0018:ffff88080f90bb58 EFLAGS: 00010246 RAX: 0000000000000400 RBX: ffff88080fdb2a28 RCX: 00000000a802c818 RDX: 0000040000080000 RSI: ffff88080d8aeb80 RDI: 0000000000000001 RBP: ffff88080f90bbc8 R08: 0000000000000000 R09: 0000000000001581 R10: 0000000000000000 R11: 0000000000000000 R12: ffff88080d8aeb80 R13: ffff88080f90bbf8 R14: ffff88080fdb28c8 R15: ffff88080fdb2a28 FS: 00007f23b2055700(0000) GS:ffff880818400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f23b2045000 CR3: 000000080cedf000 CR4: 00000000000407e0 Stack: ffff88080f90bb98 0000000000000000 7ffffffffffffffe ffff88080fdb2c30 0000000000000200 0000000000000200 0000000000000001 0000000000000200 ffff88080f90bbc8 ffff88080fdb2c30 ffff88080f90be08 0000000000000200 Call Trace: [<ffffffff8112ca9d>] generic_file_direct_write+0xed/0x180 [<ffffffff8112f2b2>] __generic_file_write_iter+0x222/0x370 [<ffffffff811f495b>] ext4_file_write_iter+0x34b/0x400 [<ffffffff811bd709>] ? aio_run_iocb+0x239/0x410 [<ffffffff811bd709>] ? aio_run_iocb+0x239/0x410 [<ffffffff810990e5>] ? local_clock+0x25/0x30 [<ffffffff810abd94>] ? __lock_acquire+0x274/0x700 [<ffffffff811f4610>] ? ext4_unwritten_wait+0xb0/0xb0 [<ffffffff811bd756>] aio_run_iocb+0x286/0x410 [<ffffffff810990e5>] ? local_clock+0x25/0x30 [<ffffffff810ac359>] ? lock_release_holdtime+0x29/0x190 [<ffffffff811bc05b>] ? lookup_ioctx+0x4b/0xf0 [<ffffffff811bde3b>] do_io_submit+0x55b/0x740 [<ffffffff811bdcaa>] ? do_io_submit+0x3ca/0x740 [<ffffffff811be030>] SyS_io_submit+0x10/0x20 [<ffffffff815ce192>] system_call_fastpath+0x16/0x1b Code: 01 48 8b 80 f0 01 00 00 48 8b 18 49 8b 45 10 0f 85 f1 01 00 00 48 03 45 c8 48 3b 43 48 0f 8f e3 01 00 00 49 83 7c 24 18 00 75 04 <0f> 0b eb fe f0 ff 83 ec 01 00 00 49 8b 44 24 18 8b 00 85 c0 89 RIP [<ffffffff811fabf2>] ext4_direct_IO+0x162/0x3d0 RSP <ffff88080f90bb58> Reported-by: Sasha Levin <sasha.levin@xxxxxxxxxx> Signed-off-by: Dmitry Monakhov <dmonakhov@xxxxxxxxxx> --- fs/ext4/file.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 95 insertions(+), 1 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index aca7b24..8477eb2 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -88,6 +88,100 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) return 0; } +/** + * copy of __generic_file_write_iter with explicit O_DIRECT status + * @iocb: IO state structure (file, offset, etc.) + * @from: iov_iter with data to write + * @direct: perform O_DIRECT IO + */ +static ssize_t +__ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from, int direct) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + ssize_t err; + ssize_t status; + size_t count = iov_iter_count(from); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + + iov_iter_truncate(from, count); + + err = file_remove_suid(file); + if (err) + goto out; + + err = file_update_time(file); + if (err) + goto out; + + /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */ + if (unlikely(direct)) { + loff_t endbyte; + + written = generic_file_direct_write(iocb, from, pos); + if (written < 0 || written == count) + goto out; + + /* + * direct-io write to a hole: fall through to buffered I/O + * for completing the rest of the request. + */ + pos += written; + count -= written; + + status = generic_perform_write(file, from, pos); + /* + * If generic_perform_write() returned a synchronous error + * then we want to return the number of bytes which were + * direct-written, or the error code if that was zero. Note + * that this differs from normal direct-io semantics, which + * will return -EFOO even if some bytes were written. + */ + if (unlikely(status < 0)) { + err = status; + goto out; + } + iocb->ki_pos = pos + status; + /* + * We need to ensure that the page cache pages are written to + * disk and invalidated to preserve the expected O_DIRECT + * semantics. + */ + endbyte = pos + status - 1; + err = filemap_write_and_wait_range(file->f_mapping, pos, + endbyte); + if (err == 0) { + written += status; + invalidate_mapping_pages(mapping, + pos >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + } else { + /* + * We don't know how much we wrote, so just return + * the number of bytes which were direct-written + */ + } + } else { + written = generic_perform_write(file, from, pos); + if (likely(written >= 0)) + iocb->ki_pos = pos + written; + } +out: + current->backing_dev_info = NULL; + return written ? written : err; +} + static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -172,7 +266,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } } - ret = __generic_file_write_iter(iocb, from); + ret = __ext4_file_write_iter(iocb, from, o_direct); mutex_unlock(&inode->i_mutex); if (ret > 0) { -- 1.7.1 -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html