Re: [PATCH v2] xfs: use per-AG reservations for the finobt

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Jan 23, 2017 at 07:05:42PM +0100, Christoph Hellwig wrote:
> Currently we try to rely on the global reserved block pool for block
> allocations for the free inode btree, but I have customer reports
> (fairly complex workload, need to find an easier reproducer) where that
> is not enough as the AG where we free an inode that requires a new
> finobt block is entirely full.  This causes us to cancel a dirty
> transaction and thus a file system shutdown.
> 
> I think the right way to guard against this is to treat the finot the same
> way as the refcount btree and have a per-AG reservations for the possible
> worst case size of it, and the patch below implements that.
> 
> Signed-off-by: Christoph Hellwig <hch@xxxxxx>
> 
> ---
> 
> Changes since V1:
>  - reduce the size of the reservation
>  - warn and fall back to the old somewhat buggy code if we can't
>    get the reservation at mount time
>  - read the AGF before asserting based on values in it
> ---
>  fs/xfs/libxfs/xfs_ag_resv.c      | 47 ++++++++++++++++++---
>  fs/xfs/libxfs/xfs_ialloc.h       |  1 -
>  fs/xfs/libxfs/xfs_ialloc_btree.c | 90 ++++++++++++++++++++++++++++++++++++++--
>  fs/xfs/libxfs/xfs_ialloc_btree.h |  3 ++
>  fs/xfs/xfs_inode.c               | 23 +++++-----
>  fs/xfs/xfs_mount.h               |  1 +
>  6 files changed, 144 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
> index d346d42..4773c1e 100644
> --- a/fs/xfs/libxfs/xfs_ag_resv.c
> +++ b/fs/xfs/libxfs/xfs_ag_resv.c
> @@ -39,6 +39,7 @@
>  #include "xfs_rmap_btree.h"
>  #include "xfs_btree.h"
>  #include "xfs_refcount_btree.h"
> +#include "xfs_ialloc_btree.h"
>  
>  /*
>   * Per-AG Block Reservations
> @@ -223,6 +224,8 @@ int
>  xfs_ag_resv_init(
>  	struct xfs_perag		*pag)
>  {
> +	struct xfs_mount		*mp = pag->pag_mount;
> +	xfs_agnumber_t			agno = pag->pag_agno;
>  	xfs_extlen_t			ask;
>  	xfs_extlen_t			used;
>  	int				error = 0;
> @@ -231,23 +234,48 @@ xfs_ag_resv_init(
>  	if (pag->pag_meta_resv.ar_asked == 0) {
>  		ask = used = 0;
>  
> -		error = xfs_refcountbt_calc_reserves(pag->pag_mount,
> -				pag->pag_agno, &ask, &used);
> +		error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used);
>  		if (error)
>  			goto out;
>  
> -		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
> -				ask, used);
> +		error = xfs_finobt_calc_reserves(mp, agno, &ask, &used);
>  		if (error)
>  			goto out;
> +
> +		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
> +				ask, used);
> +		if (error) {
> +			/*
> +			 * Because we didn't have per-AG reservations when the
> +			 * finobt feature was added we might not ne able to
> +			 * reservere all needed blocks.  Warn and fall back to
> +			 * the old and potentially buggy code in that case,
> +			 * but ensure we do have the reservation for the
> +			 * refcountbt.
> +			 */
> +			ask = used = 0;
> +
> +			error = xfs_refcountbt_calc_reserves(mp, agno, &ask,
> +					&used);
> +			if (error)
> +				goto out;
> +
> +			error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
> +					ask, used);

But... _init decreases m_ag_max_usable prior to calling _mod_fdblocks,
so double-calling _init decreases m_ag_max_usable twice.

OTOH if the _mod_fdblocks call fails then m_fdblocks hasn't been
updated, which means that we can't call _ag_resv_free to reset
m_ag_max_usable prior to retrying _init.

I think we could solve this by increasing m_ag_max_usable by ar_asked at
the very top of __xfs_ag_resv_init.  The perag structures are kzalloc'd,
so this won't have any effect on a freshly mounting filesystem.

I also want to warn more loudly about AGs that don't actually have
enough space to handle the reservation.

--D

> +			if (error)
> +				goto out;
> +
> +			xfs_warn(mp,
> +"Per-AG reservation for finobt failed.  File System may run out of space.\n");
> +			mp->m_inotbt_nores = true;
> +		}
>  	}
>  
>  	/* Create the AGFL metadata reservation */
>  	if (pag->pag_agfl_resv.ar_asked == 0) {
>  		ask = used = 0;
>  
> -		error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
> -				&ask, &used);
> +		error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used);
>  		if (error)
>  			goto out;
>  
> @@ -256,9 +284,16 @@ xfs_ag_resv_init(
>  			goto out;
>  	}
>  
> +#ifdef DEBUG
> +	/* need to read in the AGF for the ASSERT below to work */
> +	error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0);
> +	if (error)
> +		return error;
> +
>  	ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
>  	       xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <=
>  	       pag->pagf_freeblks + pag->pagf_flcount);
> +#endif
>  out:
>  	return error;
>  }
> diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
> index 0bb8966..3f24725 100644
> --- a/fs/xfs/libxfs/xfs_ialloc.h
> +++ b/fs/xfs/libxfs/xfs_ialloc.h
> @@ -168,5 +168,4 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp,
>  int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
>  		xfs_agnumber_t agno, struct xfs_buf **bpp);
>  
> -
>  #endif	/* __XFS_IALLOC_H__ */
> diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
> index 0fd086d..7c47188 100644
> --- a/fs/xfs/libxfs/xfs_ialloc_btree.c
> +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
> @@ -82,11 +82,12 @@ xfs_finobt_set_root(
>  }
>  
>  STATIC int
> -xfs_inobt_alloc_block(
> +__xfs_inobt_alloc_block(
>  	struct xfs_btree_cur	*cur,
>  	union xfs_btree_ptr	*start,
>  	union xfs_btree_ptr	*new,
> -	int			*stat)
> +	int			*stat,
> +	enum xfs_ag_resv_type	resv)
>  {
>  	xfs_alloc_arg_t		args;		/* block allocation args */
>  	int			error;		/* error return value */
> @@ -103,6 +104,7 @@ xfs_inobt_alloc_block(
>  	args.maxlen = 1;
>  	args.prod = 1;
>  	args.type = XFS_ALLOCTYPE_NEAR_BNO;
> +	args.resv = resv;
>  
>  	error = xfs_alloc_vextent(&args);
>  	if (error) {
> @@ -123,6 +125,27 @@ xfs_inobt_alloc_block(
>  }
>  
>  STATIC int
> +xfs_inobt_alloc_block(
> +	struct xfs_btree_cur	*cur,
> +	union xfs_btree_ptr	*start,
> +	union xfs_btree_ptr	*new,
> +	int			*stat)
> +{
> +	return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE);
> +}
> +
> +STATIC int
> +xfs_finobt_alloc_block(
> +	struct xfs_btree_cur	*cur,
> +	union xfs_btree_ptr	*start,
> +	union xfs_btree_ptr	*new,
> +	int			*stat)
> +{
> +	return __xfs_inobt_alloc_block(cur, start, new, stat,
> +			XFS_AG_RESV_METADATA);
> +}
> +
> +STATIC int
>  xfs_inobt_free_block(
>  	struct xfs_btree_cur	*cur,
>  	struct xfs_buf		*bp)
> @@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = {
>  
>  	.dup_cursor		= xfs_inobt_dup_cursor,
>  	.set_root		= xfs_finobt_set_root,
> -	.alloc_block		= xfs_inobt_alloc_block,
> +	.alloc_block		= xfs_finobt_alloc_block,
>  	.free_block		= xfs_inobt_free_block,
>  	.get_minrecs		= xfs_inobt_get_minrecs,
>  	.get_maxrecs		= xfs_inobt_get_maxrecs,
> @@ -480,3 +503,64 @@ xfs_inobt_rec_check_count(
>  	return 0;
>  }
>  #endif	/* DEBUG */
> +
> +static xfs_extlen_t
> +xfs_inobt_max_size(
> +	struct xfs_mount	*mp)
> +{
> +	/* Bail out if we're uninitialized, which can happen in mkfs. */
> +	if (mp->m_inobt_mxr[0] == 0)
> +		return 0;
> +
> +	return xfs_btree_calc_size(mp, mp->m_inobt_mnr,
> +		(uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
> +				XFS_INODES_PER_CHUNK);
> +}
> +
> +static int
> +xfs_inobt_count_blocks(
> +	struct xfs_mount	*mp,
> +	xfs_agnumber_t		agno,
> +	xfs_btnum_t		btnum,
> +	xfs_extlen_t		*tree_blocks)
> +{
> +	struct xfs_buf		*agbp;
> +	struct xfs_btree_cur	*cur;
> +	int			error;
> +
> +	error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp);
> +	if (error)
> +		return error;
> +
> +	cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum);
> +	error = xfs_btree_count_blocks(cur, tree_blocks);
> +	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
> +	xfs_buf_relse(agbp);
> +
> +	return error;
> +}
> +
> +/*
> + * Figure out how many blocks to reserve and how many are used by this btree.
> + */
> +int
> +xfs_finobt_calc_reserves(
> +	struct xfs_mount	*mp,
> +	xfs_agnumber_t		agno,
> +	xfs_extlen_t		*ask,
> +	xfs_extlen_t		*used)
> +{
> +	xfs_extlen_t		tree_len = 0;
> +	int			error;
> +
> +	if (!xfs_sb_version_hasfinobt(&mp->m_sb))
> +		return 0;
> +
> +	error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len);
> +	if (error)
> +		return error;
> +
> +	*ask += xfs_inobt_max_size(mp);
> +	*used += tree_len;
> +	return 0;
> +}
> diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h
> index bd88453..aa81e2e 100644
> --- a/fs/xfs/libxfs/xfs_ialloc_btree.h
> +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h
> @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *,
>  #define xfs_inobt_rec_check_count(mp, rec)	0
>  #endif	/* DEBUG */
>  
> +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno,
> +		xfs_extlen_t *ask, xfs_extlen_t *used);
> +
>  #endif	/* __XFS_IALLOC_BTREE_H__ */
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index b955779..de32f0f 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
> @@ -1792,22 +1792,23 @@ xfs_inactive_ifree(
>  	int			error;
>  
>  	/*
> -	 * The ifree transaction might need to allocate blocks for record
> -	 * insertion to the finobt. We don't want to fail here at ENOSPC, so
> -	 * allow ifree to dip into the reserved block pool if necessary.
> -	 *
> -	 * Freeing large sets of inodes generally means freeing inode chunks,
> -	 * directory and file data blocks, so this should be relatively safe.
> -	 * Only under severe circumstances should it be possible to free enough
> -	 * inodes to exhaust the reserve block pool via finobt expansion while
> -	 * at the same time not creating free space in the filesystem.
> +	 * We try to use a per-AG reservation for any block needed by the finobt
> +	 * tree, but as the finobt feature predates the per-AG reservation
> +	 * support a degraded file system might not have enough space for the
> +	 * reservation at mount time.  In that case try to dip into the reserved
> +	 * pool and pray.
>  	 *
>  	 * Send a warning if the reservation does happen to fail, as the inode
>  	 * now remains allocated and sits on the unlinked list until the fs is
>  	 * repaired.
>  	 */
> -	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
> -			XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp);
> +	if (unlikely(mp->m_inotbt_nores)) {
> +		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree,
> +				XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE,
> +				&tp);
> +	} else {
> +		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp);
> +	}
>  	if (error) {
>  		if (error == -ENOSPC) {
>  			xfs_warn_ratelimited(mp,
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 84f7852..7f351f7 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -140,6 +140,7 @@ typedef struct xfs_mount {
>  	int			m_fixedfsid[2];	/* unchanged for life of FS */
>  	uint			m_dmevmask;	/* DMI events for this FS */
>  	__uint64_t		m_flags;	/* global mount flags */
> +	bool			m_inotbt_nores; /* no per-AG finobt resv. */
>  	int			m_ialloc_inos;	/* inodes in inode allocation */
>  	int			m_ialloc_blks;	/* blocks in inode allocation */
>  	int			m_ialloc_min_blks;/* min blocks in sparse inode
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux