Re: [PATCH 07/15] btrfs: add dax write support

On Tue, Mar 26, 2019 at 02:02:53PM -0500, Goldwyn Rodrigues wrote:
> From: Goldwyn Rodrigues <rgoldwyn@xxxxxxxx>
> 
> IOMAP_F_COW informs the dax code that the parts of the range which are
> not page-aligned must first be copied from the existing extent before
> performing the write.
> 
> A new struct btrfs_iomap, which contains all the accounting and locking
> information for CoW-based writes, is passed from iomap_begin() to
> iomap_end().
> 
> For writing to a hole, iomap->cow_addr is set to zero. Would this be
> better handled by a flag, or can a valid filesystem block be at offset
> zero of the device?
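
Not sure block 0 can ever be a valid data address here, but if you want to
make the hole case explicit without adding a flag, one option might be to
borrow IOMAP_NULL_ADDR from include/linux/iomap.h -- untested, and assuming
the dax side can be taught to check for it:

		if (map->block_start != EXTENT_MAP_HOLE) {
			iomap->cow_addr = map->block_start;
			iomap->cow_pos = map->start;
		} else {
			iomap->cow_addr = IOMAP_NULL_ADDR;
		}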
> 
> Signed-off-by: Goldwyn Rodrigues <rgoldwyn@xxxxxxxx>
> ---
>  fs/btrfs/ctree.h |   6 +++
>  fs/btrfs/dax.c   | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
>  fs/btrfs/file.c  |   4 +-
>  3 files changed, 124 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index a3543a4a063d..3bcd2a4959c1 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -3801,6 +3801,12 @@ int btree_readahead_hook(struct extent_buffer *eb, int err);
>  #ifdef CONFIG_FS_DAX
>  /* dax.c */
>  ssize_t btrfs_file_dax_read(struct kiocb *iocb, struct iov_iter *to);
> +ssize_t btrfs_file_dax_write(struct kiocb *iocb, struct iov_iter *from);
> +#else
> +static inline ssize_t btrfs_file_dax_write(struct kiocb *iocb, struct iov_iter *from)
> +{
> +	return 0;
> +}
>  #endif /* CONFIG_FS_DAX */
>  
>  static inline int is_fstree(u64 rootid)
> diff --git a/fs/btrfs/dax.c b/fs/btrfs/dax.c
> index bf3d46b0acb6..49619fe3f94f 100644
> --- a/fs/btrfs/dax.c
> +++ b/fs/btrfs/dax.c
> @@ -9,30 +9,124 @@
>  #ifdef CONFIG_FS_DAX
>  #include <linux/dax.h>
>  #include <linux/iomap.h>
> +#include <linux/uio.h>
>  #include "ctree.h"
>  #include "btrfs_inode.h"
>  
> +struct btrfs_iomap {
> +	u64 start;
> +	u64 end;
> +	int nocow;
> +	struct extent_changeset *data_reserved;
> +	struct extent_state *cached_state;
> +};
> +
>  static int btrfs_iomap_begin(struct inode *inode, loff_t pos,
>  		loff_t length, unsigned flags, struct iomap *iomap)
>  {
>  	struct extent_map *em;
>  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> +
>  	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, pos, length, 0);
> +
> +	if (flags & IOMAP_WRITE) {
> +		int ret = 0, nocow;
> +		struct extent_map *map = em;
> +		struct btrfs_iomap *bi;

Please consider breaking this up into a separate helper before the
btrfs_iomap_begin function becomes long and hard to read like the xfs
one did.  :)

(Granted people also seem to dislike scrolling back and forth...)
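
Something along these lines, perhaps?  Completely untested, and
btrfs_iomap_begin_write() is a name I just made up -- it's only the
IOMAP_WRITE branch above moved out wholesale, with goto-based unwinding so
the error cleanup isn't repeated for every failure point:

static int btrfs_iomap_begin_write(struct inode *inode, loff_t pos,
		loff_t length, struct extent_map **em, struct iomap *iomap)
{
	struct extent_map *map = *em;
	struct btrfs_iomap *bi;
	int ret, nocow;

	bi = kzalloc(sizeof(struct btrfs_iomap), GFP_NOFS);
	if (!bi)
		return -ENOMEM;

	bi->start = round_down(pos, PAGE_SIZE);
	bi->end = round_up(pos + length, PAGE_SIZE);
	iomap->private = bi;

	/* Wait for existing ordered extents in range to finish */
	btrfs_wait_ordered_range(inode, bi->start, bi->end - bi->start);

	lock_extent_bits(&BTRFS_I(inode)->io_tree, bi->start, bi->end,
			&bi->cached_state);

	ret = btrfs_delalloc_reserve_space(inode, &bi->data_reserved,
			bi->start, bi->end - bi->start);
	if (ret)
		goto out_unlock;

	/* hold an extra ref; *em may be replaced below */
	refcount_inc(&map->refs);
	ret = btrfs_get_extent_map_write(em, NULL, inode, bi->start,
			bi->end - bi->start, &nocow);
	if (ret)
		goto out_release;

	if (!nocow) {
		iomap->flags |= IOMAP_F_COW;
		if (map->block_start != EXTENT_MAP_HOLE) {
			iomap->cow_addr = map->block_start;
			iomap->cow_pos = map->start;
		}
	} else {
		bi->nocow = 1;
	}
	free_extent_map(map);
	return 0;

out_release:
	btrfs_delalloc_release_space(inode, bi->data_reserved, bi->start,
			bi->end - bi->start, true);
	extent_changeset_free(bi->data_reserved);
	free_extent_map(map);
out_unlock:
	unlock_extent_cached(&BTRFS_I(inode)->io_tree, bi->start, bi->end,
			&bi->cached_state);
	kfree(bi);
	return ret;
}

Then btrfs_iomap_begin() only needs:

	if (flags & IOMAP_WRITE) {
		int ret = btrfs_iomap_begin_write(inode, pos, length,
				&em, iomap);
		if (ret)
			return ret;
	}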

> +
> +		bi = kzalloc(sizeof(struct btrfs_iomap), GFP_NOFS);
> +		if (!bi)
> +			return -ENOMEM;
> +
> +		bi->start = round_down(pos, PAGE_SIZE);
> +		bi->end = round_up(pos + length, PAGE_SIZE);
> +
> +		iomap->private = bi;
> +
> +		/* Wait for existing ordered extents in range to finish */
> +		btrfs_wait_ordered_range(inode, bi->start, bi->end - bi->start);
> +
> +		lock_extent_bits(&BTRFS_I(inode)->io_tree, bi->start, bi->end, &bi->cached_state);
> +
> +		ret = btrfs_delalloc_reserve_space(inode, &bi->data_reserved,
> +				bi->start, bi->end - bi->start);
> +		if (ret) {
> +			unlock_extent_cached(&BTRFS_I(inode)->io_tree, bi->start, bi->end,
> +					&bi->cached_state);
> +			kfree(bi);
> +			return ret;
> +		}
> +
> +		refcount_inc(&map->refs);
> +		ret = btrfs_get_extent_map_write(&em, NULL,
> +				inode, bi->start, bi->end - bi->start, &nocow);
> +		if (ret) {
> +			unlock_extent_cached(&BTRFS_I(inode)->io_tree, bi->start, bi->end,
> +					&bi->cached_state);
> +			btrfs_delalloc_release_space(inode,
> +					bi->data_reserved, bi->start,
> +					bi->end - bi->start, true);
> +			extent_changeset_free(bi->data_reserved);
> +			kfree(bi);
> +			return ret;
> +		}
> +		if (!nocow) {
> +			iomap->flags |= IOMAP_F_COW;
> +			if (map->block_start != EXTENT_MAP_HOLE) {
> +				iomap->cow_addr = map->block_start;
> +				iomap->cow_pos = map->start;

Oh, I see, cow_pos exists because the extent we're copying from and the
extent we're copying into are not necessarily going to be positioned at
the same file offset and (I guess) it's possible that the source range
could be partially sparse given the destination range?

Hmm, no, the previous patch doesn't account for that; it only seems to
know how to handle @cow_pos < @offset.  In that case, why not trim the
cow_* map to @offset?
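
Maybe I'm misreading the dax side, but something like this (totally
untested) would pin cow_pos to the page-aligned start of the write
(bi->start) instead of wherever the source extent happens to begin:

		if (map->block_start != EXTENT_MAP_HOLE) {
			u64 off = max(bi->start, map->start) - map->start;

			iomap->cow_addr = map->block_start + off;
			iomap->cow_pos = map->start + off;
		}

so cow_pos never sits below the range we're actually writing into.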

--D

> +			}
> +		} else {
> +			bi->nocow = 1;
> +		}
> +		free_extent_map(map);
> +	}
> +
> +	iomap->offset = em->start;
> +	iomap->length = em->len;
> +	iomap->bdev = em->bdev;
> +	iomap->dax_dev = fs_info->dax_dev;
> +
>  	if (em->block_start == EXTENT_MAP_HOLE) {
>  		iomap->type = IOMAP_HOLE;
>  		return 0;
>  	}
> +
>  	iomap->type = IOMAP_MAPPED;
> -	iomap->bdev = em->bdev;
> -	iomap->dax_dev = fs_info->dax_dev;
> -	iomap->offset = em->start;
> -	iomap->length = em->len;
>  	iomap->addr = em->block_start;
>  	return 0;
>  }
>  
> +static int btrfs_iomap_end(struct inode *inode, loff_t pos,
> +		loff_t length, ssize_t written, unsigned flags,
> +		struct iomap *iomap)
> +{
> +	struct btrfs_iomap *bi = iomap->private;
> +	u64 wend;
> +
> +	if (!bi)
> +		return 0;
> +
> +	unlock_extent_cached(&BTRFS_I(inode)->io_tree, bi->start, bi->end,
> +			&bi->cached_state);
> +
> +	wend = round_up(pos + written, PAGE_SIZE);
> +	if (wend < bi->end) {
> +		btrfs_delalloc_release_space(inode,
> +				bi->data_reserved, wend,
> +				bi->end - wend, true);
> +	}
> +
> +	btrfs_update_ordered_extent(inode, bi->start, wend - bi->start, true);
> +	btrfs_delalloc_release_extents(BTRFS_I(inode), wend - bi->start, false);
> +	extent_changeset_free(bi->data_reserved);
> +	kfree(bi);
> +	return 0;
> +}
> +
>  static const struct iomap_ops btrfs_iomap_ops = {
>  	.iomap_begin		= btrfs_iomap_begin,
> +	.iomap_end		= btrfs_iomap_end,
>  };
>  
>  ssize_t btrfs_file_dax_read(struct kiocb *iocb, struct iov_iter *to)
> @@ -46,4 +140,21 @@ ssize_t btrfs_file_dax_read(struct kiocb *iocb, struct iov_iter *to)
>  
>  	return ret;
>  }
> +
> +ssize_t btrfs_file_dax_write(struct kiocb *iocb, struct iov_iter *iter)
> +{
> +	ssize_t ret = 0;
> +	u64 pos = iocb->ki_pos;
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +
> +	ret = dax_iomap_rw(iocb, iter, &btrfs_iomap_ops);
> +
> +	if (ret > 0) {
> +		pos += ret;
> +		if (pos > i_size_read(inode))
> +			i_size_write(inode, pos);
> +		iocb->ki_pos = pos;
> +	}
> +	return ret;
> +}
>  #endif /* CONFIG_FS_DAX */
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index b620f4e718b2..3b320d0ab495 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -1964,7 +1964,9 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
>  	if (sync)
>  		atomic_inc(&BTRFS_I(inode)->sync_writers);
>  
> -	if (iocb->ki_flags & IOCB_DIRECT) {
> +	if (IS_DAX(inode)) {
> +		num_written = btrfs_file_dax_write(iocb, from);
> +	} else if (iocb->ki_flags & IOCB_DIRECT) {
>  		num_written = __btrfs_direct_write(iocb, from);
>  	} else {
>  		num_written = btrfs_buffered_write(iocb, from);
> -- 
> 2.16.4
> 


