Re: [PATCH 06/34] xfs: dynamic speculative EOF preallocation

Alex Elder <aelder@xxxxxxx> · Mon, 27 Dec 2010 08:57:36 -0600

On Tue, 2010-12-21 at 18:29 +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> Currently the size of the speculative preallocation during delayed
> allocation is fixed by either the allocsize mount option of a
> default size. We are seeing a lot of cases where we need to
> recommend using the allocsize mount option to prevent fragmentation
> when buffered writes land in the same AG.

This looks good.  Logarithmic reduction in the size
as the file system gets close to full seems like a
reasonable heuristic.  I have a few minor comments
below, which you can address at your option.
In any case:

Reviewed-by: Alex Elder <aelder@xxxxxxx>

> 
> Rather than using a fixed preallocation size by default (up to 64k),
> make it dynamic by basing it on the current inode size. That way the
> EOF preallocation will increase as the file size increases.  Hence
> for streaming writes we are much more likely to get large
> preallocations exactly when we need it to reduce fragementation.
> 
> For default settings, the size of the initial extents is determined
> by the number of parallel writers and the amount of memory in the
> machine. For 4GB RAM and 4 concurrent 32GB file writes:
> 
> EXT: FILE-OFFSET           BLOCK-RANGE          AG AG-OFFSET                 TOTAL
>    0: [0..1048575]:         1048672..2097247      0 (1048672..2097247)      1048576
>    1: [1048576..2097151]:   5242976..6291551      0 (5242976..6291551)      1048576
>    2: [2097152..4194303]:   12583008..14680159    0 (12583008..14680159)    2097152
>    3: [4194304..8388607]:   25165920..29360223    0 (25165920..29360223)    4194304
>    4: [8388608..16777215]:  58720352..67108959    0 (58720352..67108959)    8388608
>    5: [16777216..33554423]: 117440584..134217791  0 (117440584..134217791) 16777208
>    6: [33554424..50331511]: 184549056..201326143  0 (184549056..201326143) 16777088
>    7: [50331512..67108599]: 251657408..268434495  0 (251657408..268434495) 16777088
> 
> and for 16 concurrent 16GB file writes:
> 
>  EXT: FILE-OFFSET           BLOCK-RANGE          AG AG-OFFSET                 TOTAL
>    0: [0..262143]:          2490472..2752615      0 (2490472..2752615)       262144
>    1: [262144..524287]:     6291560..6553703      0 (6291560..6553703)       262144
>    2: [524288..1048575]:    13631592..14155879    0 (13631592..14155879)     524288
>    3: [1048576..2097151]:   30408808..31457383    0 (30408808..31457383)    1048576
>    4: [2097152..4194303]:   52428904..54526055    0 (52428904..54526055)    2097152
>    5: [4194304..8388607]:   104857704..109052007  0 (104857704..109052007)  4194304
>    6: [8388608..16777215]:  209715304..218103911  0 (209715304..218103911)  8388608
>    7: [16777216..33554423]: 452984848..469762055  0 (452984848..469762055) 16777208
> 
> Because it is hard to take back specualtive preallocation, cases
> where there are large slow growing log files on a nearly full
> filesystem may cause premature ENOSPC. Hence as the filesystem nears
> full, the maximum dynamic prealloc size Ñs reduced according to this
> table (based on 4k block size):
> 
> freespace       max prealloc size
>   >5%             full extent (8GB)
>   4-5%             2GB (8GB >> 2)
>   3-4%             1GB (8GB >> 3)
>   2-3%           512MB (8GB >> 4)
>   1-2%           256MB (8GB >> 5)
>   <1%            128MB (8GB >> 6)

On really small filesystems this might be excessive.
On the other hand, you're already basing the size on
the file size so that should limit things just fine.

> This should reduce the amount of space held in speculative
> preallocation for such cases.
> 
> The allocsize mount option turns off the dynamic behaviour and fixes
> the prealloc size to whatever the mount option specifies. i.e. the
> behaviour is unchanged.
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> ---
>  fs/xfs/xfs_fsops.c |    1 +
>  fs/xfs/xfs_iomap.c |   84 +++++++++++++++++++++++++++++++++++++++++++++------
>  fs/xfs/xfs_mount.c |   21 +++++++++++++
>  fs/xfs/xfs_mount.h |   14 ++++++++
>  4 files changed, 110 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> index be34ff2..6d17206 100644
> --- a/fs/xfs/xfs_fsops.c
> +++ b/fs/xfs/xfs_fsops.c
> @@ -374,6 +374,7 @@ xfs_growfs_data_private(
>  		mp->m_maxicount = icount << mp->m_sb.sb_inopblog;
>  	} else
>  		mp->m_maxicount = 0;
> +	xfs_set_low_space_thresholds(mp);
>  
>  	/* update secondary superblocks. */
>  	for (agno = 1; agno < nagcount; agno++) {
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 22b62a1..f36d2c8 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -267,6 +267,9 @@ error_out:
>   * If the caller is doing a write at the end of the file, then extend the
>   * allocation out to the file system's write iosize.  We clean up any extra
>   * space left over when the file is closed in xfs_inactive().
> + *
> + * If we find we already have delalloc preallocation beyond EOF, don't do more
> + * preallocation as it it not needed.
>   */
>  STATIC int
>  xfs_iomap_eof_want_preallocate(
> @@ -282,6 +285,7 @@ xfs_iomap_eof_want_preallocate(
>  	xfs_filblks_t   count_fsb;
>  	xfs_fsblock_t	firstblock;
>  	int		n, error, imaps;
> +	int		found_delalloc = 0;
>  
>  	*prealloc = 0;
>  	if ((offset + count) <= ip->i_size)
> @@ -306,12 +310,60 @@ xfs_iomap_eof_want_preallocate(
>  				return 0;
>  			start_fsb += imap[n].br_blockcount;
>  			count_fsb -= imap[n].br_blockcount;
> +
> +			if (imap[n].br_startblock == DELAYSTARTBLOCK)
> +				found_delalloc = 1;
>  		}
>  	}
> -	*prealloc = 1;
> +	if (!found_delalloc)
> +		*prealloc = 1;

Isn't this a separate change, possibly worthy of its
own commit?

>  	return 0;
>  }
>  
> +/*
> + * If we don't have a user specified preallocation size, dynamically increase
> + * the preallocation size as the size of the file grows. Cap the maximum size
> + * at a single extent or less if the filesystem is near full. The closer the
> + * filesystem is to full, the smaller the maximum prealocation.
> + */
> +STATIC xfs_fsblock_t
> +xfs_iomap_prealloc_size(
> +	struct xfs_mount	*mp,
> +	struct xfs_inode	*ip)
> +{
> +	xfs_fsblock_t		alloc_blocks = 0;
> +
> +	if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) {
> +		int shift = 0;
> +		int64_t freesp;
> +
> +		alloc_blocks = XFS_B_TO_FSB(mp, ip->i_size);

Why not i_size_read()?  Only matters for 32-bit, but
you're using do_div() below so you do seem to care
about it sometimes.

> +		alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN,
> +					rounddown_pow_of_two(alloc_blocks));
> +
> +		freesp = percpu_counter_read_positive(
> +						&mp->m_icsb[XFS_ICSB_FDBLOCKS]);
> +		if (freesp < mp->m_low_space[XFS_LOWSP_5_PCNT]) {
> +			shift = 2;
> +			if (freesp < mp->m_low_space[XFS_LOWSP_4_PCNT])
> +				shift++;
> +			if (freesp < mp->m_low_space[XFS_LOWSP_3_PCNT])
> +				shift++;
> +			if (freesp < mp->m_low_space[XFS_LOWSP_2_PCNT])
> +				shift++;
> +			if (freesp < mp->m_low_space[XFS_LOWSP_1_PCNT])
> +				shift++;
> +		}
> +		if (shift)
> +			alloc_blocks >>= shift;
> +	}
> +
> +	if (alloc_blocks < mp->m_writeio_blocks)
> +		alloc_blocks = mp->m_writeio_blocks;
> +
> +	return alloc_blocks;
> +}
> +
>  int
>  xfs_iomap_write_delay(
>  	xfs_inode_t	*ip,
> @@ -344,6 +396,7 @@ xfs_iomap_write_delay(
>  	extsz = xfs_get_extsz_hint(ip);
>  	offset_fsb = XFS_B_TO_FSBT(mp, offset);
>  
> +

Extra blank line.

>  	error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
>  				imap, XFS_WRITE_IMAPS, &prealloc);
>  	if (error)
> @@ -351,9 +404,11 @@ xfs_iomap_write_delay(
>  
>  retry:
>  	if (prealloc) {
> +		xfs_fsblock_t	alloc_blocks = xfs_iomap_prealloc_size(mp, ip);
> +
>  		aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1));
>  		ioalign = XFS_B_TO_FSBT(mp, aligned_offset);
> -		last_fsb = ioalign + mp->m_writeio_blocks;
> +		last_fsb = ioalign + alloc_blocks;
>  	} else {
>  		last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
>  	}
> @@ -371,22 +426,31 @@ retry:
>  			  XFS_BMAPI_DELAY | XFS_BMAPI_WRITE |
>  			  XFS_BMAPI_ENTIRE, &firstblock, 1, imap,
>  			  &nimaps, NULL);
> -	if (error && (error != ENOSPC))
> +	switch (error) {
> +	case 0:
> +	case ENOSPC:
> +	case EDQUOT:
> +		break;

This (above and below, in this hunk) is also a separate
change, possibly worthy of its own small commit.

> +	default:
>  		return XFS_ERROR(error);
> +	}
>  
>  	/*
> -	 * If bmapi returned us nothing, and if we didn't get back EDQUOT,
> -	 * then we must have run out of space - flush all other inodes with
> -	 * delalloc blocks and retry without EOF preallocation.
> +	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT.  For
> +	 * ENOSPC, * flush all other inodes with delalloc blocks to free up
> +	 * some of the excess reserved metadata space. For both cases, retry
> +	 * without EOF preallocation.
>  	 */
>  	if (nimaps == 0) {
>  		trace_xfs_delalloc_enospc(ip, offset, count);
>  		if (flushed)
> -			return XFS_ERROR(ENOSPC);
> +			return XFS_ERROR(error ? error : ENOSPC);
>  
> -		xfs_iunlock(ip, XFS_ILOCK_EXCL);
> -		xfs_flush_inodes(ip);
> -		xfs_ilock(ip, XFS_ILOCK_EXCL);
> +		if (error == ENOSPC) {
> +			xfs_iunlock(ip, XFS_ILOCK_EXCL);
> +			xfs_flush_inodes(ip);
> +			xfs_ilock(ip, XFS_ILOCK_EXCL);
> +		}
>  
>  		flushed = 1;
>  		error = 0;
> diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> index d5710232..f1b094d 100644
> --- a/fs/xfs/xfs_mount.c
> +++ b/fs/xfs/xfs_mount.c
> @@ -1101,6 +1101,24 @@ xfs_set_rw_sizes(xfs_mount_t *mp)
>  }
>  
>  /*
> + * precalculate the low space thresholds for dynamic speculative preallocation.
> + */
> +void
> +xfs_set_low_space_thresholds(
> +	struct xfs_mount	*mp)
> +{
> +	int i;
> +
> +	for (i = 0; i < XFS_LOWSP_MAX; i++) {
> +		__uint64_t space = mp->m_sb.sb_dblocks;
> +
> +		do_div(space, 100);

How about computing this once, outside the loop?

> +		mp->m_low_space[i] = space * (i + 1);
> +	}
> +}
> +
> +
> +/*
>   * Set whether we're using inode alignment.
>   */
>  STATIC void
> @@ -1322,6 +1340,9 @@ xfs_mountfs(
>  	 */
>  	xfs_set_rw_sizes(mp);
>  
> +	/* set the low space thresholds for dynamic preallocation */
> +	xfs_set_low_space_thresholds(mp);
> +
>  	/*
>  	 * Set the inode cluster size.
>  	 * This may still be overridden by the file system
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 03ad25c6..7b42e04 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -75,6 +75,16 @@ enum {
>  	XFS_ICSB_MAX,
>  };
>  
> +/* dynamic preallocation free space thresholds, 5% down to 1% */
> +enum {
> +	XFS_LOWSP_1_PCNT = 0,

Why not make 1% be 1, 2% be 2, etc.?

> +	XFS_LOWSP_2_PCNT,
> +	XFS_LOWSP_3_PCNT,
> +	XFS_LOWSP_4_PCNT,
> +	XFS_LOWSP_5_PCNT,
> +	XFS_LOWSP_MAX,
> +};
> +
>  typedef struct xfs_mount {
>  	struct super_block	*m_super;
>  	xfs_tid_t		m_tid;		/* next unused tid for fs */
> @@ -169,6 +179,8 @@ typedef struct xfs_mount {
>  						   on the next remount,rw */
>  	struct shrinker		m_inode_shrink;	/* inode reclaim shrinker */
>  	struct percpu_counter	m_icsb[XFS_ICSB_MAX];
> +	int64_t			m_low_space[XFS_LOWSP_MAX];
> +						/* low free space thresholds */
>  } xfs_mount_t;
>  
>  /*
> @@ -333,6 +345,8 @@ extern void	xfs_icsb_sync_counters(struct xfs_mount *);
>  extern int	xfs_icsb_modify_inodes(struct xfs_mount *, int, int64_t);
>  extern int	xfs_icsb_modify_free_blocks(struct xfs_mount *, int64_t, int);
>  
> +extern void	xfs_set_low_space_thresholds(struct xfs_mount *);
> +
>  #endif	/* __KERNEL__ */
>  
>  extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);

_______________________________________________
xfs mailing list
xfs@xxxxxxxxxxx
http://oss.sgi.com/mailman/listinfo/xfs