On Mon, Apr 29, 2019 at 12:26:38PM -0500, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues <rgoldwyn@xxxxxxxx>
> 
> IOMAP_DAX_COW informs the dax code to first copy the data edges
> that are not page-aligned before performing the write. Checking
> whether the data edges are page-aligned is the responsibility of
> ->iomap_begin(), and the source address is stored in ->inline_data.
> 
> A new struct btrfs_iomap is passed from iomap_begin() to
> iomap_end(); it contains all the accounting and locking information
> for CoW-based writes.
> 
> When writing to a hole, iomap->inline_data is set to zero.
> 
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@xxxxxxxx>
> ---
>  fs/btrfs/ctree.h |   6 ++
>  fs/btrfs/dax.c   | 182 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  fs/btrfs/file.c  |   4 +-
>  3 files changed, 185 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 1e3e758b83c2..eec01eb92f33 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -3801,6 +3801,12 @@ int btree_readahead_hook(struct extent_buffer *eb, int err);
>  #ifdef CONFIG_FS_DAX
>  /* dax.c */
>  ssize_t btrfs_file_dax_read(struct kiocb *iocb, struct iov_iter *to);
> +ssize_t btrfs_file_dax_write(struct kiocb *iocb, struct iov_iter *from);
> +#else
> +static inline ssize_t btrfs_file_dax_write(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	return 0;
> +}
>  #endif /* CONFIG_FS_DAX */
>  
>  static inline int is_fstree(u64 rootid)
> diff --git a/fs/btrfs/dax.c b/fs/btrfs/dax.c
> index bf3d46b0acb6..f5cc9bcdbf14 100644
> --- a/fs/btrfs/dax.c
> +++ b/fs/btrfs/dax.c
> @@ -9,30 +9,184 @@
>  #ifdef CONFIG_FS_DAX
>  #include <linux/dax.h>
>  #include <linux/iomap.h>
> +#include <linux/uio.h>
>  #include "ctree.h"
>  #include "btrfs_inode.h"
>  
> +struct btrfs_iomap {
> +	u64 start;
> +	u64 end;
> +	bool nocow;
> +	struct extent_changeset *data_reserved;
> +	struct extent_state *cached_state;
> +};
> +
> +static struct btrfs_iomap *btrfs_iomap_init(struct inode *inode,
> +		struct extent_map **em,
> +		loff_t pos, loff_t length)
> +{
> +	int ret = 0;
> +	struct extent_map *map = *em;
> +	struct btrfs_iomap *bi;
> +
> +	bi = kzalloc(sizeof(struct btrfs_iomap), GFP_NOFS);
> +	if (!bi)
> +		return ERR_PTR(-ENOMEM);
> +
> +	bi->start = round_down(pos, PAGE_SIZE);
> +	bi->end = PAGE_ALIGN(pos + length);
> +
> +	/* Wait for existing ordered extents in range to finish */
> +	btrfs_wait_ordered_range(inode, bi->start, bi->end - bi->start);
> +
> +	lock_extent_bits(&BTRFS_I(inode)->io_tree, bi->start, bi->end, &bi->cached_state);
> +
> +	ret = btrfs_delalloc_reserve_space(inode, &bi->data_reserved,
> +			bi->start, bi->end - bi->start);
> +	if (ret) {
> +		unlock_extent_cached(&BTRFS_I(inode)->io_tree, bi->start, bi->end,
> +				&bi->cached_state);
> +		kfree(bi);
> +		return ERR_PTR(ret);
> +	}
> +
> +	refcount_inc(&map->refs);
> +	ret = btrfs_get_extent_map_write(em, NULL,
> +			inode, bi->start, bi->end - bi->start, &bi->nocow);
> +	if (ret) {
> +		unlock_extent_cached(&BTRFS_I(inode)->io_tree, bi->start, bi->end,
> +				&bi->cached_state);
> +		btrfs_delalloc_release_space(inode,
> +				bi->data_reserved, bi->start,
> +				bi->end - bi->start, true);
> +		extent_changeset_free(bi->data_reserved);
> +		kfree(bi);
> +		return ERR_PTR(ret);
> +	}
> +	free_extent_map(map);
> +	return bi;
> +}
> +
> +static void *dax_address(struct block_device *bdev, struct dax_device *dax_dev,
> +		sector_t sector, loff_t pos, loff_t length)

This looks like a common function for fs/dax.c.
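Something like the below, maybe?  Totally untested sketch, and
dax_iomap_direct_access() is just a name I made up, but if the helper
lived in fs/dax.c with an export, other iomap+dax filesystems could
share it instead of each open-coding the same sequence:

/*
 * Untested sketch, name made up: resolve @sector on @dax_dev to a
 * kernel virtual address covering @pos + @length, or an ERR_PTR on
 * failure.  Same logic as the btrfs dax_address() above, just hoisted
 * into common code.
 */
void *dax_iomap_direct_access(struct block_device *bdev,
		struct dax_device *dax_dev, sector_t sector,
		loff_t pos, loff_t length)
{
	size_t size = ALIGN(pos + length, PAGE_SIZE);
	void *kaddr = NULL;
	pgoff_t pgoff;
	long map_len;
	int id, ret;

	id = dax_read_lock();
	ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
	if (ret)
		goto out;

	/* We only need one contiguous mapping of the source range. */
	map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
			&kaddr, NULL);
	if (map_len < 0)
		ret = map_len;
out:
	dax_read_unlock(id);
	return ret ? ERR_PTR(ret) : kaddr;
}
EXPORT_SYMBOL_GPL(dax_iomap_direct_access);

That keeps the dax_read_lock()/dax_direct_access() dance out of the
individual filesystems entirely.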
--D

> +{
> +	size_t size = ALIGN(pos + length, PAGE_SIZE);
> +	int id, ret = 0;
> +	void *kaddr = NULL;
> +	pgoff_t pgoff;
> +	long map_len;
> +
> +	id = dax_read_lock();
> +
> +	ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
> +	if (ret)
> +		goto out;
> +
> +	map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
> +			&kaddr, NULL);
> +	if (map_len < 0)
> +		ret = map_len;
> +
> +out:
> +	dax_read_unlock(id);
> +	if (ret)
> +		return ERR_PTR(ret);
> +	return kaddr;
> +}
> +
>  static int btrfs_iomap_begin(struct inode *inode, loff_t pos,
>  		loff_t length, unsigned flags, struct iomap *iomap)
>  {
>  	struct extent_map *em;
>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +	struct btrfs_iomap *bi = NULL;
> +	unsigned offset = pos & (PAGE_SIZE - 1);
> +	u64 srcblk = 0;
> +	loff_t diff;
> +
>  	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, pos, length, 0);
> +
> +	iomap->type = IOMAP_MAPPED;
> +
> +	if (flags & IOMAP_WRITE) {
> +		if (em->block_start != EXTENT_MAP_HOLE)
> +			srcblk = em->block_start + pos - em->start - offset;
> +
> +		bi = btrfs_iomap_init(inode, &em, pos, length);
> +		if (IS_ERR(bi))
> +			return PTR_ERR(bi);
> +
> +	}
> +
> +	/*
> +	 * Advance the difference between pos and start, to align well with
> +	 * inline_data in case of writes
> +	 */
> +	diff = round_down(pos - em->start, PAGE_SIZE);
> +	iomap->offset = em->start + diff;
> +	iomap->length = em->len - diff;
> +	iomap->bdev = em->bdev;
> +	iomap->dax_dev = fs_info->dax_dev;
> +
> +	/*
> +	 * This will be true for reads only since we have already
> +	 * allocated em
> +	 */
>  	if (em->block_start == EXTENT_MAP_HOLE) {
>  		iomap->type = IOMAP_HOLE;
>  		return 0;
>  	}
> -	iomap->type = IOMAP_MAPPED;
> -	iomap->bdev = em->bdev;
> -	iomap->dax_dev = fs_info->dax_dev;
> -	iomap->offset = em->start;
> -	iomap->length = em->len;
> -	iomap->addr = em->block_start;
> +
> +	iomap->addr = em->block_start + diff;
> +	/* Check if we really need to copy data from old extent */
> +	if (bi && !bi->nocow && (offset || pos + length < bi->end)) {
> +		iomap->type = IOMAP_DAX_COW;
> +		if (srcblk) {
> +			sector_t sector = (srcblk + (pos & PAGE_MASK) -
> +					iomap->offset) >> 9;
> +			iomap->inline_data = dax_address(em->bdev,
> +					fs_info->dax_dev, sector, pos, length);
> +			if (IS_ERR(iomap->inline_data)) {
> +				kfree(bi);
> +				return PTR_ERR(iomap->inline_data);
> +			}
> +		}
> +	}
> +
> +	iomap->private = bi;
> +	return 0;
> +}
> +
> +static int btrfs_iomap_end(struct inode *inode, loff_t pos,
> +		loff_t length, ssize_t written, unsigned flags,
> +		struct iomap *iomap)
> +{
> +	struct btrfs_iomap *bi = iomap->private;
> +	u64 wend;
> +
> +	if (!bi)
> +		return 0;
> +
> +	unlock_extent_cached(&BTRFS_I(inode)->io_tree, bi->start, bi->end,
> +			&bi->cached_state);
> +
> +	wend = PAGE_ALIGN(pos + written);
> +	if (wend < bi->end) {
> +		btrfs_delalloc_release_space(inode,
> +				bi->data_reserved, wend,
> +				bi->end - wend, true);
> +	}
> +
> +	btrfs_update_ordered_extent(inode, bi->start, wend - bi->start, true);
> +	btrfs_delalloc_release_extents(BTRFS_I(inode), wend - bi->start, false);
> +	extent_changeset_free(bi->data_reserved);
> +	kfree(bi);
>  	return 0;
>  }
>  
>  static const struct iomap_ops btrfs_iomap_ops = {
>  	.iomap_begin = btrfs_iomap_begin,
> +	.iomap_end = btrfs_iomap_end,
>  };
>  
>  ssize_t btrfs_file_dax_read(struct kiocb *iocb, struct iov_iter *to)
> @@ -46,4 +200,20 @@ ssize_t btrfs_file_dax_read(struct kiocb *iocb, struct iov_iter *to)
>  
>  	return ret;
>  }
> +
> +ssize_t btrfs_file_dax_write(struct kiocb *iocb, struct iov_iter *iter)
> +{
> +	ssize_t ret = 0;
> +	u64 pos = iocb->ki_pos;
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	ret = dax_iomap_rw(iocb, iter, &btrfs_iomap_ops);
> +
> +	if (ret > 0) {
> +		pos += ret;
> +		if (pos > i_size_read(inode))
> +			i_size_write(inode, pos);
> +		iocb->ki_pos = pos;
> +	}
> +	return ret;
> +}
>  #endif /* CONFIG_FS_DAX */
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 9194591f9eea..a795023e26ca 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -1964,7 +1964,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
>  	if (sync)
>  		atomic_inc(&BTRFS_I(inode)->sync_writers);
>  
> -	if (iocb->ki_flags & IOCB_DIRECT) {
> +	if (IS_DAX(inode)) {
> +		num_written = btrfs_file_dax_write(iocb, from);
> +	} else if (iocb->ki_flags & IOCB_DIRECT) {
>  		num_written = __btrfs_direct_write(iocb, from);
>  	} else {
>  		num_written = btrfs_buffered_write(iocb, from);
> -- 
> 2.16.4
> 
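Following up on my comment above: with a helper like that exported from
fs/dax.c, the IOMAP_DAX_COW branch in btrfs_iomap_begin() would shrink
to something like this (again, dax_iomap_direct_access() is a made-up
name, just a sketch):

	/* Grab the source address for the copy from the old extent. */
	iomap->inline_data = dax_iomap_direct_access(em->bdev,
			fs_info->dax_dev, sector, pos, length);
	if (IS_ERR(iomap->inline_data)) {
		kfree(bi);
		return PTR_ERR(iomap->inline_data);
	}

and btrfs wouldn't need its own dax_address() at all.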