On Mon, 2019-09-09 at 20:27 +0200, Christoph Hellwig wrote: > From: Goldwyn Rodrigues <rgoldwyn@xxxxxxxx> > > The srcmap is used to identify where the read is to be performed > from. > It is passed to ->iomap_begin, which can fill it in if we need to > read > data for partially written blocks from a different location than the > write target. The srcmap is only supported for buffered writes so > far. > > Signed-off-by: Goldwyn Rodrigues <rgoldwyn@xxxxxxxx> > [hch: merged two patches, removed the IOMAP_F_COW flag, use iomap as > srcmap if not set, adjust length down to srcmap end as well] > Signed-off-by: Christoph Hellwig <hch@xxxxxx> > --- > fs/dax.c | 9 ++++-- > fs/ext2/inode.c | 2 +- > fs/ext4/inode.c | 2 +- > fs/gfs2/bmap.c | 3 +- > fs/iomap/apply.c | 23 +++++++++++---- > fs/iomap/buffered-io.c | 65 +++++++++++++++++++++++----------------- > -- > fs/iomap/direct-io.c | 2 +- > fs/iomap/fiemap.c | 4 +-- > fs/iomap/seek.c | 4 +-- > fs/iomap/swapfile.c | 3 +- > fs/xfs/xfs_iomap.c | 9 ++++-- > include/linux/iomap.h | 5 ++-- > 12 files changed, 79 insertions(+), 52 deletions(-) > > diff --git a/fs/dax.c b/fs/dax.c > index a237141d8787..8016be24bbd9 100644 > --- a/fs/dax.c > +++ b/fs/dax.c > @@ -1090,7 +1090,7 @@ EXPORT_SYMBOL_GPL(__dax_zero_page_range); > > static loff_t > dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void > *data, > - struct iomap *iomap) > + struct iomap *iomap, struct iomap *srcmap) > { > struct block_device *bdev = iomap->bdev; > struct dax_device *dax_dev = iomap->dax_dev; > @@ -1248,6 +1248,7 @@ static vm_fault_t dax_iomap_pte_fault(struct > vm_fault *vmf, pfn_t *pfnp, > unsigned long vaddr = vmf->address; > loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; > struct iomap iomap = { 0 }; > + struct iomap srcmap = { 0 }; > unsigned flags = IOMAP_FAULT; > int error, major = 0; > bool write = vmf->flags & FAULT_FLAG_WRITE; > @@ -1292,7 +1293,7 @@ static vm_fault_t dax_iomap_pte_fault(struct > vm_fault *vmf, pfn_t *pfnp, > * the file system block size to be equal the page size, > which means > * that we never have to deal with more than a single extent > here. > */ > - error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, > &iomap); > + error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, > &iomap, &srcmap); > if (iomap_errp) > *iomap_errp = error; > if (error) { > @@ -1472,6 +1473,7 @@ static vm_fault_t dax_iomap_pmd_fault(struct > vm_fault *vmf, pfn_t *pfnp, > struct inode *inode = mapping->host; > vm_fault_t result = VM_FAULT_FALLBACK; > struct iomap iomap = { 0 }; > + struct iomap srcmap = { 0 }; > pgoff_t max_pgoff; > void *entry; > loff_t pos; > @@ -1546,7 +1548,8 @@ static vm_fault_t dax_iomap_pmd_fault(struct > vm_fault *vmf, pfn_t *pfnp, > * to look up our filesystem block. > */ > pos = (loff_t)xas.xa_index << PAGE_SHIFT; > - error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, > &iomap); > + error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, > &iomap, > + &srcmap); > if (error) > goto unlock_entry; > > diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c > index 7004ce581a32..467c13ff6b40 100644 > --- a/fs/ext2/inode.c > +++ b/fs/ext2/inode.c > @@ -801,7 +801,7 @@ int ext2_get_block(struct inode *inode, sector_t > iblock, > > #ifdef CONFIG_FS_DAX > static int ext2_iomap_begin(struct inode *inode, loff_t offset, > loff_t length, > - unsigned flags, struct iomap *iomap) > + unsigned flags, struct iomap *iomap, struct iomap > *srcmap) > { > unsigned int blkbits = inode->i_blkbits; > unsigned long first_block = offset >> blkbits; > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c > index 420fe3deed39..e2116e8228d2 100644 > --- a/fs/ext4/inode.c > +++ b/fs/ext4/inode.c > @@ -3453,7 +3453,7 @@ static bool ext4_inode_datasync_dirty(struct > inode *inode) > } > > static int ext4_iomap_begin(struct inode *inode, loff_t offset, > loff_t length, > - unsigned flags, struct iomap *iomap) > + unsigned flags, struct iomap *iomap, struct iomap > *srcmap) > { > struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); > unsigned int blkbits = inode->i_blkbits; > diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c > index 79581b9bdebb..0bf8e8fa82bd 100644 > --- a/fs/gfs2/bmap.c > +++ b/fs/gfs2/bmap.c > @@ -1123,7 +1123,8 @@ static int gfs2_iomap_begin_write(struct inode > *inode, loff_t pos, > } > > static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t > length, > - unsigned flags, struct iomap *iomap) > + unsigned flags, struct iomap *iomap, > + struct iomap *srcmap) > { > struct gfs2_inode *ip = GFS2_I(inode); > struct metapath mp = { .mp_aheight = 1, }; > diff --git a/fs/iomap/apply.c b/fs/iomap/apply.c > index 54c02aecf3cd..67efd86675e6 100644 > --- a/fs/iomap/apply.c > +++ b/fs/iomap/apply.c > @@ -24,7 +24,9 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t > length, unsigned flags, > const struct iomap_ops *ops, void *data, > iomap_actor_t actor) > { > struct iomap iomap = { 0 }; > + struct iomap srcmap = { 0 }; > loff_t written = 0, ret; > + u64 end; > > /* > * Need to map a range from start position for length bytes. > This can > @@ -38,7 +40,7 @@ iomap_apply(struct inode *inode, loff_t pos, loff_t > length, unsigned flags, > * expose transient stale data. If the reserve fails, we can > safely > * back out at this point as there is nothing to undo. > */ > - ret = ops->iomap_begin(inode, pos, length, flags, &iomap); > + ret = ops->iomap_begin(inode, pos, length, flags, &iomap, > &srcmap); > if (ret) > return ret; > if (WARN_ON(iomap.offset > pos)) > @@ -50,15 +52,26 @@ iomap_apply(struct inode *inode, loff_t pos, > loff_t length, unsigned flags, > * Cut down the length to the one actually provided by the > filesystem, > * as it might not be able to give us the whole size that we > requested. > */ > - if (iomap.offset + iomap.length < pos + length) > - length = iomap.offset + iomap.length - pos; > + end = iomap.offset + iomap.length; > + if (srcmap.type) > + end = min(end, srcmap.offset + srcmap.length); > + if (pos + length > end) > + length = end - pos; > Yes, that looks more correct. However, can we be smart and not bother setting the minimum of end and srcmap.offset + srcmap.length if it is not required? ie in situations where end coincides with block boundary. Or if srcmap.length falls short, until the last block boundary of iomap? I did think about this scenario. This case is specific to CoW and thought this is best handled by filesystem's iomap_begin(). If this goes in, the filesystems would have to "falsify" srcmap length information to maximize the amount of I/O that goes in one iteration. -- Goldwyn