On Mon, Feb 18, 2019 at 10:18:21AM +0100, Christoph Hellwig wrote: > We speculatively allocate extents in the COW fork to reduce > fragmentation. But when we write data into such COW fork blocks that > do now shadow an allocation in the data fork SEEK_DATA will not > correctly report it, as it only looks at the data fork extents. > The only reason why that hasn't been an issue so far is because > we even use these speculative COW fork preallocations over holes in > the data fork at all for buffered writes, and blocks in the COW > fork that are written by direct writes are moved into the data > fork immediately at I/O completion time. > > Add a new set of iomap_ops for SEEK_HOLE/SEEK_DATA which looks into > both the COW and data fork, and reports all COW extents as unwritten > to the iomap layer. While this isn't strictly true for COW fork > extents that were already converted to real extents, the practical > semantics that you can't read data from them until they are moved > into the data fork are very similar, and this will force the iomap > layer into probing the extents for actually present data. > > Signed-off-by: Christoph Hellwig <hch@xxxxxx> > --- > fs/xfs/xfs_file.c | 4 +-- > fs/xfs/xfs_iomap.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++ > fs/xfs/xfs_iomap.h | 1 + > 3 files changed, 88 insertions(+), 2 deletions(-) > > diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c > index e47425071e65..1d07dcfbbff3 100644 > --- a/fs/xfs/xfs_file.c > +++ b/fs/xfs/xfs_file.c > @@ -1068,10 +1068,10 @@ xfs_file_llseek( > default: > return generic_file_llseek(file, offset, whence); > case SEEK_HOLE: > - offset = iomap_seek_hole(inode, offset, &xfs_iomap_ops); > + offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops); > break; > case SEEK_DATA: > - offset = iomap_seek_data(inode, offset, &xfs_iomap_ops); > + offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops); > break; > } > > diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c > index 284c5e68f695..a7599b084571 100644 > --- a/fs/xfs/xfs_iomap.c > +++ b/fs/xfs/xfs_iomap.c > @@ -1068,6 +1068,91 @@ const struct iomap_ops xfs_iomap_ops = { > .iomap_end = xfs_file_iomap_end, > }; > > +static int > +xfs_seek_iomap_begin( > + struct inode *inode, > + loff_t offset, > + loff_t length, > + unsigned flags, > + struct iomap *iomap) > +{ > + struct xfs_inode *ip = XFS_I(inode); > + struct xfs_mount *mp = ip->i_mount; > + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); > + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + length); > + xfs_fileoff_t cow_fsb = NULLFILEOFF, data_fsb = NULLFILEOFF; > + struct xfs_iext_cursor icur; > + struct xfs_bmbt_irec imap, cmap; > + int error = 0; > + unsigned lockmode; > + > + if (XFS_FORCED_SHUTDOWN(mp)) > + return -EIO; > + > + lockmode = xfs_ilock_data_map_shared(ip); > + if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) { > + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); > + if (error) > + goto out_unlock; > + } > + > + if (xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap)) { > + /* > + * If we found a data extent we are done. > + */ > + if (imap.br_startoff <= offset_fsb) > + goto done; > + data_fsb = imap.br_startoff; > + } else { > + /* > + * Fake a hole until the end of the file. > + */ > + data_fsb = min(XFS_B_TO_FSB(mp, offset + length), > + XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes)); > + } > + > + /* > + * If a COW fork extent covers the hole, report it - capped to the next > + * data fork extent: > + */ > + if (xfs_inode_has_cow_data(ip) && > + xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) > + cow_fsb = cmap.br_startoff; > + if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) { > + if (data_fsb < cow_fsb + cmap.br_blockcount) > + end_fsb = min(end_fsb, data_fsb); > + xfs_trim_extent(&cmap, offset_fsb, end_fsb); > + error = xfs_bmbt_to_iomap(ip, iomap, &cmap, true); > + /* > + * Force page cache probing to avoid reporting COW extents as > + * data even if they haven't been written to: This comment is a little awkward, I'll change it to: /* * This is a COW extent, so we must probe the page cache because there * could be dirty page cache being backed by this extent. */ Otherwise looks fine to me, Reviewed-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> --D > + */ > + iomap->type = IOMAP_UNWRITTEN; > + goto out_unlock; > + } > + > + /* > + * Else report a hole, capped to the next found data or COW extent. > + */ > + if (cow_fsb != NULLFILEOFF && cow_fsb < data_fsb) > + imap.br_blockcount = cow_fsb - offset_fsb; > + else > + imap.br_blockcount = data_fsb - offset_fsb; > + imap.br_startoff = offset_fsb; > + imap.br_startblock = HOLESTARTBLOCK; > + imap.br_state = XFS_EXT_NORM; > +done: > + xfs_trim_extent(&imap, offset_fsb, end_fsb); > + error = xfs_bmbt_to_iomap(ip, iomap, &imap, false); > +out_unlock: > + xfs_iunlock(ip, lockmode); > + return error; > +} > + > +const struct iomap_ops xfs_seek_iomap_ops = { > + .iomap_begin = xfs_seek_iomap_begin, > +}; > + > static int > xfs_xattr_iomap_begin( > struct inode *inode, > diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h > index 37b584c3069b..5c2f6aa6d78f 100644 > --- a/fs/xfs/xfs_iomap.h > +++ b/fs/xfs/xfs_iomap.h > @@ -40,6 +40,7 @@ xfs_aligned_fsb_count( > } > > extern const struct iomap_ops xfs_iomap_ops; > +extern const struct iomap_ops xfs_seek_iomap_ops; > extern const struct iomap_ops xfs_xattr_iomap_ops; > > #endif /* __XFS_IOMAP_H__*/ > -- > 2.20.1 >