Re: [PATCH 8/8] xfs: introduce an always_cow mode

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Feb 18, 2019 at 10:18:27AM +0100, Christoph Hellwig wrote:
> Add a mode where XFS never overwrites existing blocks in place.  This
> is to aid debugging our COW code, and also put infatructure in place
> for things like possible future support for zoned block devices, which
> can't support overwrites.
> 
> This mode is enabled globally by doing a:
> 
>     echo 1 > /sys/fs/xfs/debug/always_cow
> 
> Note that the parameter is global to allow running all tests in xfstests
> easily in this mode, which would not easily be possible with a per-fs
> sysfs file.
> 
> In always_cow mode persistent preallocations are disabled, and fallocate
> will fail when called with a 0 mode (with our without
> FALLOC_FL_KEEP_SIZE), and not create unwritten extent for zeroed space
> when called with FALLOC_FL_ZERO_RANGE or FALLOC_FL_UNSHARE_RANGE.
> 
> There are a few interesting xfstests failures when run in always_cow
> mode:
> 
>  - generic/392 fails because the bytes used in the file used to test
>    hole punch recovery are less after the log replay.  This is
>    because the blocks written and then punched out are only freed
>    with a delay due to the logging mechanism.
>  - xfs/170 will fail as the already fragile file streams mechanism
>    doesn't seem to interact well with the COW allocator
>  - xfs/180 xfs/182 xfs/192 xfs/198 xfs/204 and xfs/208 will claim
>    the file system is badly fragmented, but there is not much we
>    can do to avoid that when always writing out of place
>  - xfs/205 fails because overwriting a file in always_cow mode
>    will require new space allocation and the assumption in the
>    test thus don't work anymore.
>  - xfs/326 fails to modify the file at all in always_cow mode after
>    injecting the refcount error, leading to an unexpected md5sum
>    after the remount, but that again is expected

Yeah, I saw failures in a bunch of tests when running always_cow against
a 4k blocksize rmap/reflink filesystem:

generic/476 xfs/066 xfs/1501 xfs/064 xfs/296 xfs/205 xfs/056 xfs/198
generic/392 xfs/170 xfs/450 xfs/060 xfs/068 xfs/192 generic/475 xfs/283

Will have a look at those tomorrow...

> Signed-off-by: Christoph Hellwig <hch@xxxxxx>
> ---
>  fs/xfs/xfs_aops.c      |  2 +-
>  fs/xfs/xfs_bmap_util.c |  9 +++------
>  fs/xfs/xfs_file.c      | 27 ++++++++++++++++++++-------
>  fs/xfs/xfs_iomap.c     | 28 ++++++++++++++++++----------
>  fs/xfs/xfs_mount.h     |  1 +
>  fs/xfs/xfs_reflink.c   | 28 ++++++++++++++++++++++++----
>  fs/xfs/xfs_reflink.h   | 13 +++++++++++++
>  fs/xfs/xfs_super.c     | 15 +++++++++++----
>  fs/xfs/xfs_sysctl.h    |  1 +
>  fs/xfs/xfs_sysfs.c     | 24 ++++++++++++++++++++++++
>  10 files changed, 116 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
> index 983d11c27d32..7b8bb6bde981 100644
> --- a/fs/xfs/xfs_aops.c
> +++ b/fs/xfs/xfs_aops.c
> @@ -1023,7 +1023,7 @@ xfs_vm_bmap(
>  	 * Since we don't pass back blockdev info, we can't return bmap
>  	 * information for rt files either.
>  	 */
> -	if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
> +	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
>  		return 0;
>  	return iomap_bmap(mapping, block, &xfs_iomap_ops);
>  }
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 1ee8c5539fa4..2db43ff4f8b5 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1162,16 +1162,13 @@ xfs_zero_file_space(
>  	 * by virtue of the hole punch.
>  	 */
>  	error = xfs_free_file_space(ip, offset, len);
> -	if (error)
> -		goto out;
> +	if (error || xfs_is_always_cow_inode(ip))
> +		return error;
>  
> -	error = xfs_alloc_file_space(ip, round_down(offset, blksize),
> +	return xfs_alloc_file_space(ip, round_down(offset, blksize),
>  				     round_up(offset + len, blksize) -
>  				     round_down(offset, blksize),
>  				     XFS_BMAPI_PREALLOC);
> -out:
> -	return error;
> -
>  }
>  
>  static int
> diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
> index 1d07dcfbbff3..770cc2edf777 100644
> --- a/fs/xfs/xfs_file.c
> +++ b/fs/xfs/xfs_file.c
> @@ -507,7 +507,7 @@ xfs_file_dio_aio_write(
>  		 * We can't properly handle unaligned direct I/O to reflink
>  		 * files yet, as we can't unshare a partial block.
>  		 */
> -		if (xfs_is_reflink_inode(ip)) {
> +		if (xfs_is_cow_inode(ip)) {
>  			trace_xfs_reflink_bounce_dio_write(ip, iocb->ki_pos, count);
>  			return -EREMCHG;
>  		}
> @@ -872,14 +872,27 @@ xfs_file_fallocate(
>  				goto out_unlock;
>  		}
>  
> -		if (mode & FALLOC_FL_ZERO_RANGE)
> +		if (mode & FALLOC_FL_ZERO_RANGE) {
>  			error = xfs_zero_file_space(ip, offset, len);
> -		else {
> -			if (mode & FALLOC_FL_UNSHARE_RANGE) {
> -				error = xfs_reflink_unshare(ip, offset, len);
> -				if (error)
> -					goto out_unlock;
> +		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
> +			error = xfs_reflink_unshare(ip, offset, len);
> +			if (error)
> +				goto out_unlock;
> +
> +			if (!xfs_is_always_cow_inode(ip)) {
> +				error = xfs_alloc_file_space(ip, offset, len,
> +						XFS_BMAPI_PREALLOC);
>  			}
> +		} else {
> +			/*
> +			 * If always_cow mode we can't use preallocations and
> +			 * thus should not create them.
> +			 */
> +			if (xfs_is_always_cow_inode(ip)) {
> +				error = -EOPNOTSUPP;
> +				goto out_unlock;
> +			}
> +
>  			error = xfs_alloc_file_space(ip, offset, len,
>  						     XFS_BMAPI_PREALLOC);
>  		}
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 7b71dcc97ef0..72533e9dddc6 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -395,12 +395,13 @@ xfs_quota_calc_throttle(
>  STATIC xfs_fsblock_t
>  xfs_iomap_prealloc_size(
>  	struct xfs_inode	*ip,
> +	int			whichfork,
>  	loff_t			offset,
>  	loff_t			count,
>  	struct xfs_iext_cursor	*icur)
>  {
>  	struct xfs_mount	*mp = ip->i_mount;
> -	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
> +	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
>  	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
>  	struct xfs_bmbt_irec	prev;
>  	int			shift = 0;
> @@ -593,7 +594,11 @@ xfs_file_iomap_begin_delay(
>  	 * themselves.  Second the lookup in the extent list is generally faster
>  	 * than going out to the shared extent tree.
>  	 */
> -	if (xfs_is_reflink_inode(ip)) {
> +	if (xfs_is_cow_inode(ip)) {
> +		if (!ip->i_cowfp) {
> +			ASSERT(!xfs_is_reflink_inode(ip));
> +			xfs_ifork_init_cow(ip);
> +		}
>  		cow_eof = !xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb,
>  				&ccur, &cmap);
>  		if (!cow_eof && cmap.br_startoff <= offset_fsb) {
> @@ -609,7 +614,7 @@ xfs_file_iomap_begin_delay(
>  		 * overwriting shared extents.   This includes zeroing of
>  		 * existing extents that contain data.
>  		 */
> -		if (!xfs_is_reflink_inode(ip) ||
> +		if (!xfs_is_cow_inode(ip) ||
>  		    ((flags & IOMAP_ZERO) && imap.br_state != XFS_EXT_NORM)) {
>  			trace_xfs_iomap_found(ip, offset, count, XFS_DATA_FORK,
>  					&imap);
> @@ -619,7 +624,7 @@ xfs_file_iomap_begin_delay(
>  		xfs_trim_extent(&imap, offset_fsb, end_fsb - offset_fsb);
>  
>  		/* Trim the mapping to the nearest shared extent boundary. */
> -		error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
> +		error = xfs_inode_need_cow(ip, &imap, &shared);
>  		if (error)
>  			goto out_unlock;
>  
> @@ -648,15 +653,18 @@ xfs_file_iomap_begin_delay(
>  		 */
>  		count = min_t(loff_t, count, 1024 * PAGE_SIZE);
>  		end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
> +
> +		if (xfs_is_always_cow_inode(ip))
> +			whichfork = XFS_COW_FORK;
>  	}
>  
>  	error = xfs_qm_dqattach_locked(ip, false);
>  	if (error)
>  		goto out_unlock;
>  
> -	if (eof && whichfork == XFS_DATA_FORK) {
> -		prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count,
> -				&icur);
> +	if (eof) {
> +		prealloc_blocks = xfs_iomap_prealloc_size(ip, whichfork, offset,
> +				count, &icur);
>  		if (prealloc_blocks) {
>  			xfs_extlen_t	align;
>  			xfs_off_t	end_offset;
> @@ -867,7 +875,7 @@ xfs_ilock_for_iomap(
>  	 * COW writes may allocate delalloc space or convert unwritten COW
>  	 * extents, so we need to make sure to take the lock exclusively here.
>  	 */
> -	if (xfs_is_reflink_inode(ip) && is_write) {
> +	if (xfs_is_cow_inode(ip) && is_write) {
>  		/*
>  		 * FIXME: It could still overwrite on unshared extents and not
>  		 * need allocation.
> @@ -901,7 +909,7 @@ xfs_ilock_for_iomap(
>  	 * check, so if we got ILOCK_SHARED for a write and but we're now a
>  	 * reflink inode we have to switch to ILOCK_EXCL and relock.
>  	 */
> -	if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_reflink_inode(ip)) {
> +	if (mode == XFS_ILOCK_SHARED && is_write && xfs_is_cow_inode(ip)) {
>  		xfs_iunlock(ip, mode);
>  		mode = XFS_ILOCK_EXCL;
>  		goto relock;
> @@ -973,7 +981,7 @@ xfs_file_iomap_begin(
>  	 * Break shared extents if necessary. Checks for non-blocking IO have
>  	 * been done up front, so we don't need to do them here.
>  	 */
> -	if (xfs_is_reflink_inode(ip)) {
> +	if (xfs_is_cow_inode(ip)) {
>  		struct xfs_bmbt_irec	orig = imap;
>  
>  		/* if zeroing doesn't need COW allocation, then we are done. */
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index a33f45077867..fa8b37d61838 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -194,6 +194,7 @@ typedef struct xfs_mount {
>  	 */
>  	uint32_t		m_generation;
>  
> +	bool			m_always_cow;
>  	bool			m_fail_unmount;
>  #ifdef DEBUG
>  	/*
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index f84b37fa4f17..e2d9179bd50d 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -192,7 +192,7 @@ xfs_reflink_trim_around_shared(
>  	int			error = 0;
>  
>  	/* Holes, unwritten, and delalloc extents cannot be shared */
> -	if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
> +	if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
>  		*shared = false;
>  		return 0;
>  	}
> @@ -234,6 +234,23 @@ xfs_reflink_trim_around_shared(
>  	}
>  }
>  
> +bool
> +xfs_inode_need_cow(

xfs_inode_trim_to_cow() ?

Otherwise generally looks ok to me, but it's late so I'll send this now
and get back to review in the morning.

--D

> +	struct xfs_inode	*ip,
> +	struct xfs_bmbt_irec	*imap,
> +	bool			*shared)
> +{
> +	/* We can't update any real extents in always COW mode. */
> +	if (xfs_is_always_cow_inode(ip) &&
> +	    !isnullstartblock(imap->br_startblock)) {
> +		*shared = true;
> +		return 0;
> +	}
> +
> +	/* Trim the mapping to the nearest shared extent boundary. */
> +	return xfs_reflink_trim_around_shared(ip, imap, shared);
> +}
> +
>  static int
>  xfs_reflink_convert_cow_locked(
>  	struct xfs_inode	*ip,
> @@ -321,7 +338,7 @@ xfs_find_trim_cow_extent(
>  	if (got.br_startoff > offset_fsb) {
>  		xfs_trim_extent(imap, imap->br_startoff,
>  				got.br_startoff - imap->br_startoff);
> -		return xfs_reflink_trim_around_shared(ip, imap, shared);
> +		return xfs_inode_need_cow(ip, imap, shared);
>  	}
>  
>  	*shared = true;
> @@ -356,7 +373,10 @@ xfs_reflink_allocate_cow(
>  	xfs_extlen_t		resblks = 0;
>  
>  	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
> -	ASSERT(xfs_is_reflink_inode(ip));
> +	if (!ip->i_cowfp) {
> +		ASSERT(!xfs_is_reflink_inode(ip));
> +		xfs_ifork_init_cow(ip);
> +	}
>  
>  	error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
>  	if (error || !*shared)
> @@ -542,7 +562,7 @@ xfs_reflink_cancel_cow_range(
>  	int			error;
>  
>  	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
> -	ASSERT(xfs_is_reflink_inode(ip));
> +	ASSERT(ip->i_cowfp);
>  
>  	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
>  	if (count == NULLFILEOFF)
> diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
> index 4a9e3cd4768a..2a3052fbe23e 100644
> --- a/fs/xfs/xfs_reflink.h
> +++ b/fs/xfs/xfs_reflink.h
> @@ -6,11 +6,24 @@
>  #ifndef __XFS_REFLINK_H
>  #define __XFS_REFLINK_H 1
>  
> +static inline bool xfs_is_always_cow_inode(struct xfs_inode *ip)
> +{
> +	return ip->i_mount->m_always_cow &&
> +		xfs_sb_version_hasreflink(&ip->i_mount->m_sb);
> +}
> +
> +static inline bool xfs_is_cow_inode(struct xfs_inode *ip)
> +{
> +	return xfs_is_reflink_inode(ip) || xfs_is_always_cow_inode(ip);
> +}
> +
>  extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
>  		xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
>  		xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
>  extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *irec, bool *shared);
> +bool xfs_inode_need_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap,
> +		bool *shared);
>  
>  extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
>  		struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode,
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index c9097cb0b955..6be183a391b6 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1729,11 +1729,18 @@ xfs_fs_fill_super(
>  		}
>  	}
>  
> -	if (xfs_sb_version_hasreflink(&mp->m_sb) && mp->m_sb.sb_rblocks) {
> -		xfs_alert(mp,
> +	if (xfs_sb_version_hasreflink(&mp->m_sb)) {
> +		if (mp->m_sb.sb_rblocks) {
> +			xfs_alert(mp,
>  	"reflink not compatible with realtime device!");
> -		error = -EINVAL;
> -		goto out_filestream_unmount;
> +			error = -EINVAL;
> +			goto out_filestream_unmount;
> +		}
> +
> +		if (xfs_globals.always_cow) {
> +			xfs_info(mp, "using DEBUG-only always_cow mode.");
> +			mp->m_always_cow = true;
> +		}
>  	}
>  
>  	if (xfs_sb_version_hasrmapbt(&mp->m_sb) && mp->m_sb.sb_rblocks) {
> diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
> index 168488130a19..ad7f9be13087 100644
> --- a/fs/xfs/xfs_sysctl.h
> +++ b/fs/xfs/xfs_sysctl.h
> @@ -85,6 +85,7 @@ struct xfs_globals {
>  	int	log_recovery_delay;	/* log recovery delay (secs) */
>  	int	mount_delay;		/* mount setup delay (secs) */
>  	bool	bug_on_assert;		/* BUG() the kernel on assert failure */
> +	bool	always_cow;		/* use COW fork for all overwrites */
>  };
>  extern struct xfs_globals	xfs_globals;
>  
> diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
> index cd6a994a7250..cabda13f3c64 100644
> --- a/fs/xfs/xfs_sysfs.c
> +++ b/fs/xfs/xfs_sysfs.c
> @@ -183,10 +183,34 @@ mount_delay_show(
>  }
>  XFS_SYSFS_ATTR_RW(mount_delay);
>  
> +static ssize_t
> +always_cow_store(
> +	struct kobject	*kobject,
> +	const char	*buf,
> +	size_t		count)
> +{
> +	ssize_t		ret;
> +
> +	ret = kstrtobool(buf, &xfs_globals.always_cow);
> +	if (ret < 0)
> +		return ret;
> +	return count;
> +}
> +
> +static ssize_t
> +always_cow_show(
> +	struct kobject	*kobject,
> +	char		*buf)
> +{
> +	return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.always_cow);
> +}
> +XFS_SYSFS_ATTR_RW(always_cow);
> +
>  static struct attribute *xfs_dbg_attrs[] = {
>  	ATTR_LIST(bug_on_assert),
>  	ATTR_LIST(log_recovery_delay),
>  	ATTR_LIST(mount_delay),
> +	ATTR_LIST(always_cow),
>  	NULL,
>  };
>  
> -- 
> 2.20.1
> 



[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux