On Wed, Jun 22, 2016 at 05:27:15PM +0200, Christoph Hellwig wrote: > So far the DAX code overloaded the direct I/O code path. There is very little > in common between the two, and untangling them allows to clean up both variants. > > As a ѕide effect we also get separate trace points for both I/O types. > > Signed-off-by: Christoph Hellwig <hch@xxxxxx> > --- > fs/xfs/xfs_file.c | 139 ++++++++++++++++++++++++++++++++++++++++++----------- > fs/xfs/xfs_trace.h | 2 + > 2 files changed, 112 insertions(+), 29 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index 24b267d..0e74325 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -262,13 +262,11 @@ xfs_file_dio_aio_read( > else > target = ip->i_mount->m_ddev_targp; > > - if (!IS_DAX(inode)) { > - /* DIO must be aligned to device logical sector size */ > - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { > - if (iocb->ki_pos == isize) > - return 0; > - return -EINVAL; > - } > + /* DIO must be aligned to device logical sector size */ > + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { > + if (iocb->ki_pos == isize) > + return 0; > + return -EINVAL; > } > > /* > @@ -317,13 +315,37 @@ xfs_file_dio_aio_read( > } > > data = *to; > - if (IS_DAX(inode)) { > - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, > - NULL, 0); > - } else { > - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, > - xfs_get_blocks_direct, NULL, NULL, 0); > + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, > + xfs_get_blocks_direct, NULL, NULL, 0); > + if (ret > 0) { > + iocb->ki_pos += ret; > + iov_iter_advance(to, ret); > } > + xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); > + > + file_accessed(iocb->ki_filp); So I noticed that generic/323 starts crashing in file_accessed -> touch_atime because iocb->ki_filp->f_path.dentry == NULL. For a while I thought it was some weird reflink bug, but I finally had time to go build a vanilla 4.8-rc8 kernel and that blew up here too. I'm not sure why this line got inserted here, since it wasn't there prior to this patch, AFAICT. <shrug> --D > + return ret; > +} > + > +STATIC ssize_t > +xfs_file_dax_read( > + struct kiocb *iocb, > + struct iov_iter *to) > +{ > + struct address_space *mapping = iocb->ki_filp->f_mapping; > + struct inode *inode = mapping->host; > + struct xfs_inode *ip = XFS_I(inode); > + struct iov_iter data = *to; > + size_t count = iov_iter_count(to); > + ssize_t ret = 0; > + > + trace_xfs_file_dax_read(ip, count, iocb->ki_pos); > + > + if (!count) > + return 0; /* skip atime */ > + > + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); > + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, NULL, 0); > if (ret > 0) { > iocb->ki_pos += ret; > iov_iter_advance(to, ret); > @@ -356,7 +378,8 @@ xfs_file_read_iter( > struct kiocb *iocb, > struct iov_iter *to) > { > - struct xfs_mount *mp = XFS_I(file_inode(iocb->ki_filp))->i_mount; > + struct inode *inode = file_inode(iocb->ki_filp); > + struct xfs_mount *mp = XFS_I(inode)->i_mount; > ssize_t ret = 0; > > XFS_STATS_INC(mp, xs_read_calls); > @@ -364,7 +387,9 @@ xfs_file_read_iter( > if (XFS_FORCED_SHUTDOWN(mp)) > return -EIO; > > - if (iocb->ki_flags & IOCB_DIRECT) > + if (IS_DAX(inode)) > + ret = xfs_file_dax_read(iocb, to); > + else if (iocb->ki_flags & IOCB_DIRECT) > ret = xfs_file_dio_aio_read(iocb, to); > else > ret = xfs_file_buffered_aio_read(iocb, to); > @@ -586,8 +611,7 @@ xfs_file_dio_aio_write( > mp->m_rtdev_targp : mp->m_ddev_targp; > > /* DIO must be aligned to device logical sector size */ > - if (!IS_DAX(inode) && > - ((iocb->ki_pos | count) & target->bt_logical_sectormask)) > + if ((iocb->ki_pos | count) & target->bt_logical_sectormask) > return -EINVAL; > > /* "unaligned" here means not aligned to a filesystem block */ > @@ -656,14 +680,9 @@ xfs_file_dio_aio_write( > trace_xfs_file_direct_write(ip, count, iocb->ki_pos); > > data = *from; > - if (IS_DAX(inode)) { > - ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, > - xfs_end_io_direct_write, 0); > - } else { > - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, > - xfs_get_blocks_direct, xfs_end_io_direct_write, > - NULL, DIO_ASYNC_EXTEND); > - } > + ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, > + xfs_get_blocks_direct, xfs_end_io_direct_write, > + NULL, DIO_ASYNC_EXTEND); > > /* see generic_file_direct_write() for why this is necessary */ > if (mapping->nrpages) { > @@ -680,10 +699,70 @@ out: > xfs_rw_iunlock(ip, iolock); > > /* > - * No fallback to buffered IO on errors for XFS. DAX can result in > - * partial writes, but direct IO will either complete fully or fail. > + * No fallback to buffered IO on errors for XFS, direct IO will either > + * complete fully or fail. > + */ > + ASSERT(ret < 0 || ret == count); > + return ret; > +} > + > +STATIC ssize_t > +xfs_file_dax_write( > + struct kiocb *iocb, > + struct iov_iter *from) > +{ > + struct address_space *mapping = iocb->ki_filp->f_mapping; > + struct inode *inode = mapping->host; > + struct xfs_inode *ip = XFS_I(inode); > + struct xfs_mount *mp = ip->i_mount; > + ssize_t ret = 0; > + int unaligned_io = 0; > + int iolock; > + struct iov_iter data; > + > + /* "unaligned" here means not aligned to a filesystem block */ > + if ((iocb->ki_pos & mp->m_blockmask) || > + ((iocb->ki_pos + iov_iter_count(from)) & mp->m_blockmask)) { > + unaligned_io = 1; > + iolock = XFS_IOLOCK_EXCL; > + } else if (mapping->nrpages) { > + iolock = XFS_IOLOCK_EXCL; > + } else { > + iolock = XFS_IOLOCK_SHARED; > + } > + xfs_rw_ilock(ip, iolock); > + > + ret = xfs_file_aio_write_checks(iocb, from, &iolock); > + if (ret) > + goto out; > + > + /* > + * Yes, even DAX files can have page cache attached to them: A zeroed > + * page is inserted into the pagecache when we have to serve a write > + * fault on a hole. It should never be dirtied and can simply be > + * dropped from the pagecache once we get real data for the page. > */ > - ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); > + if (mapping->nrpages) { > + ret = invalidate_inode_pages2(mapping); > + WARN_ON_ONCE(ret); > + } > + > + if (iolock == XFS_IOLOCK_EXCL && !unaligned_io) { > + xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); > + iolock = XFS_IOLOCK_SHARED; > + } > + > + trace_xfs_file_dax_write(ip, iov_iter_count(from), iocb->ki_pos); > + > + data = *from; > + ret = dax_do_io(iocb, inode, &data, xfs_get_blocks_direct, > + xfs_end_io_direct_write, 0); > + if (ret > 0) { > + iocb->ki_pos += ret; > + iov_iter_advance(from, ret); > + } > +out: > + xfs_rw_iunlock(ip, iolock); > return ret; > } > > @@ -765,7 +844,9 @@ xfs_file_write_iter( > if (XFS_FORCED_SHUTDOWN(ip->i_mount)) > return -EIO; > > - if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) > + if (IS_DAX(inode)) > + ret = xfs_file_dax_write(iocb, from); > + else if (iocb->ki_flags & IOCB_DIRECT) > ret = xfs_file_dio_aio_write(iocb, from); > else > ret = xfs_file_buffered_aio_write(iocb, from); > diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h > index 2504f94..1451690 100644 > --- a/fs/xfs/xfs_trace.h > +++ b/fs/xfs/xfs_trace.h > @@ -1165,8 +1165,10 @@ DEFINE_EVENT(xfs_file_class, name, \ > TP_ARGS(ip, count, offset)) > DEFINE_RW_EVENT(xfs_file_buffered_read); > DEFINE_RW_EVENT(xfs_file_direct_read); > +DEFINE_RW_EVENT(xfs_file_dax_read); > DEFINE_RW_EVENT(xfs_file_buffered_write); > DEFINE_RW_EVENT(xfs_file_direct_write); > +DEFINE_RW_EVENT(xfs_file_dax_write); > DEFINE_RW_EVENT(xfs_file_splice_read); > > DECLARE_EVENT_CLASS(xfs_page_class, > -- > 2.1.4 > > _______________________________________________ > xfs mailing list > xfs@xxxxxxxxxxx > http://oss.sgi.com/mailman/listinfo/xfs -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html