On Fri, Dec 30, 2016 at 03:19:58PM +0100, Christoph Hellwig wrote: > Currently we try to rely on the global reserved block pool for block > allocations for the free inode btree, but I have customer reports > (fairly complex workload, need to find an easier reproducer) where that > is not enough as the AG where we free an inode that requires a new > finobt block is entirely full. This causes us to cancel a dirty > transaction and thus a file system shutdown. > > I think the right way to guard against this is to treat the finot the same > way as the refcount btree and have a per-AG reservations for the possible > worst case size of it, and the patch below implements that. > > Signed-off-by: Christoph Hellwig <hch@xxxxxx> > --- > fs/xfs/libxfs/xfs_ag_resv.c | 6 +++ > fs/xfs/libxfs/xfs_ialloc.h | 1 - > fs/xfs/libxfs/xfs_ialloc_btree.c | 96 ++++++++++++++++++++++++++++++++++++++-- > fs/xfs/libxfs/xfs_ialloc_btree.h | 3 ++ > fs/xfs/libxfs/xfs_trans_space.h | 3 -- > fs/xfs/xfs_inode.c | 25 ++--------- > 6 files changed, 105 insertions(+), 29 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c > index e5ebc37..592754b 100644 > --- a/fs/xfs/libxfs/xfs_ag_resv.c > +++ b/fs/xfs/libxfs/xfs_ag_resv.c > @@ -39,6 +39,7 @@ > #include "xfs_rmap_btree.h" > #include "xfs_btree.h" > #include "xfs_refcount_btree.h" > +#include "xfs_ialloc_btree.h" > > /* > * Per-AG Block Reservations > @@ -236,6 +237,11 @@ xfs_ag_resv_init( > if (error) > goto out; > > + error = xfs_finobt_calc_reserves(pag->pag_mount, > + pag->pag_agno, &ask, &used); > + if (error) > + goto out; > + > error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA, > ask, used); > if (error) > diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h > index 0bb8966..3f24725 100644 > --- a/fs/xfs/libxfs/xfs_ialloc.h > +++ b/fs/xfs/libxfs/xfs_ialloc.h > @@ -168,5 +168,4 @@ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, > int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, > xfs_agnumber_t agno, struct xfs_buf **bpp); > > - > #endif /* __XFS_IALLOC_H__ */ > diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c > index 0fd086d..3aba153 100644 > --- a/fs/xfs/libxfs/xfs_ialloc_btree.c > +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c > @@ -82,11 +82,12 @@ xfs_finobt_set_root( > } > > STATIC int > -xfs_inobt_alloc_block( > +__xfs_inobt_alloc_block( > struct xfs_btree_cur *cur, > union xfs_btree_ptr *start, > union xfs_btree_ptr *new, > - int *stat) > + int *stat, > + enum xfs_ag_resv_type resv) > { > xfs_alloc_arg_t args; /* block allocation args */ > int error; /* error return value */ > @@ -103,6 +104,7 @@ xfs_inobt_alloc_block( > args.maxlen = 1; > args.prod = 1; > args.type = XFS_ALLOCTYPE_NEAR_BNO; > + args.resv = resv; > > error = xfs_alloc_vextent(&args); > if (error) { > @@ -123,6 +125,27 @@ xfs_inobt_alloc_block( > } > > STATIC int > +xfs_inobt_alloc_block( > + struct xfs_btree_cur *cur, > + union xfs_btree_ptr *start, > + union xfs_btree_ptr *new, > + int *stat) > +{ > + return __xfs_inobt_alloc_block(cur, start, new, stat, XFS_AG_RESV_NONE); > +} > + > +STATIC int > +xfs_finobt_alloc_block( > + struct xfs_btree_cur *cur, > + union xfs_btree_ptr *start, > + union xfs_btree_ptr *new, > + int *stat) > +{ > + return __xfs_inobt_alloc_block(cur, start, new, stat, > + XFS_AG_RESV_METADATA); > +} > + > +STATIC int > xfs_inobt_free_block( > struct xfs_btree_cur *cur, > struct xfs_buf *bp) > @@ -328,7 +351,7 @@ static const struct xfs_btree_ops xfs_finobt_ops = { > > .dup_cursor = xfs_inobt_dup_cursor, > .set_root = xfs_finobt_set_root, > - .alloc_block = xfs_inobt_alloc_block, > + .alloc_block = xfs_finobt_alloc_block, > .free_block = xfs_inobt_free_block, > .get_minrecs = xfs_inobt_get_minrecs, > .get_maxrecs = xfs_inobt_get_maxrecs, > @@ -480,3 +503,70 @@ xfs_inobt_rec_check_count( > return 0; > } > #endif /* DEBUG */ > + > +static xfs_extlen_t > +xfs_inobt_calc_size( > + struct xfs_mount *mp, > + unsigned long long len) > +{ > + return xfs_btree_calc_size(mp, mp->m_inobt_mnr, len); This call figures out worst-case how many blocks we need to store a given number of records... > +} > + > +static xfs_extlen_t > +xfs_inobt_max_size( > + struct xfs_mount *mp) > +{ > + /* Bail out if we're uninitialized, which can happen in mkfs. */ > + if (mp->m_inobt_mxr[0] == 0) > + return 0; > + > + return xfs_inobt_calc_size(mp, mp->m_sb.sb_agblocks); ...and so here we calculate the number of blocks needed to store the maximum number of finobt records possible for an AG. IIRC, each *inobt record refers to a single chunk of 64 inodes (or at least a theoretical chunk in the spinodes=1 case), so I think we can reduce the reservation to... nr = m_sb.sb_agblocks * m_sb.sb_inopblock / XFS_INODES_PER_CHUNK; return xfs_inobt_calc_size(mp, nr); ...right? For rmap/refcount the maximum number of records is the number of blocks in the AG. > +} > + > +static int > +xfs_inobt_count_blocks( > + struct xfs_mount *mp, > + xfs_agnumber_t agno, > + xfs_btnum_t btnum, > + xfs_extlen_t *tree_blocks) > +{ > + struct xfs_buf *agbp; > + struct xfs_btree_cur *cur; > + int error; > + > + error = xfs_ialloc_read_agi(mp, NULL, agno, &agbp); > + if (error) > + return error; > + > + cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno, btnum); > + error = xfs_btree_count_blocks(cur, tree_blocks); > + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); > + xfs_buf_relse(agbp); > + > + return error; > +} > + > +/* > + * Figure out how many blocks to reserve and how many are used by this btree. > + */ > +int > +xfs_finobt_calc_reserves( > + struct xfs_mount *mp, > + xfs_agnumber_t agno, > + xfs_extlen_t *ask, > + xfs_extlen_t *used) > +{ > + xfs_extlen_t tree_len = 0; > + int error; > + > + if (!xfs_sb_version_hasfinobt(&mp->m_sb)) > + return 0; > + > + error = xfs_inobt_count_blocks(mp, agno, XFS_BTNUM_FINO, &tree_len); This requires us to traverse all the blocks in the finobt at mount time, which isn't necessarily quick. For refcount/rmap we cache the number of tree blocks in the AGF to speed this up... but it was easy to sneak that into the disk format. :) For finobt I wonder if one could defer the block counting work to a separate thread if the AG has enough free blocks to cover, say, 10x the maximum reservation? Though that could be racy and maybe finobts are small enough that the impact on mount time is low anyway? There's also the unsolved problem of what happens if we mount and find agf_freeblks < (sum(ask) - sum(used)) -- right now we eat that state and hope that we don't later ENOSPC and crash. For reflink and rmap we will have always had the AG reservation and therefore it should never happen that we have fewer free blocks than reserved blocks. (Unless the user does something pathological like using CoW to create many billions of separate rmap records...) But as for retroactively adding AG reservations for an existing tree, I guess we'll have to come up with a strategy for dealing with insufficient free blocks. I suppose one could try to use xfs_fsr to move large contiguous extents to a less full AG, if there are any... (I actually hit the "insufficient freeblks for AG reservation" case over the holiday when I "upgraded" an XFS to rmap+reflink the foolish way and neglected to ensure the free space...) --D > + if (error) > + return error; > + > + *ask += xfs_inobt_max_size(mp); > + *used += tree_len; > + return 0; > +} > diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h > index bd88453..aa81e2e 100644 > --- a/fs/xfs/libxfs/xfs_ialloc_btree.h > +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h > @@ -72,4 +72,7 @@ int xfs_inobt_rec_check_count(struct xfs_mount *, > #define xfs_inobt_rec_check_count(mp, rec) 0 > #endif /* DEBUG */ > > +int xfs_finobt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, > + xfs_extlen_t *ask, xfs_extlen_t *used); > + > #endif /* __XFS_IALLOC_BTREE_H__ */ > diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h > index 7917f6e..6db3e2d 100644 > --- a/fs/xfs/libxfs/xfs_trans_space.h > +++ b/fs/xfs/libxfs/xfs_trans_space.h > @@ -94,8 +94,5 @@ > (XFS_DIRREMOVE_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl)) > #define XFS_SYMLINK_SPACE_RES(mp,nl,b) \ > (XFS_IALLOC_SPACE_RES(mp) + XFS_DIRENTER_SPACE_RES(mp,nl) + (b)) > -#define XFS_IFREE_SPACE_RES(mp) \ > - (xfs_sb_version_hasfinobt(&mp->m_sb) ? (mp)->m_in_maxlevels : 0) > - > > #endif /* __XFS_TRANS_SPACE_H__ */ > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index b955779..0283ab6 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -1792,30 +1792,11 @@ xfs_inactive_ifree( > int error; > > /* > - * The ifree transaction might need to allocate blocks for record > - * insertion to the finobt. We don't want to fail here at ENOSPC, so > - * allow ifree to dip into the reserved block pool if necessary. > - * > - * Freeing large sets of inodes generally means freeing inode chunks, > - * directory and file data blocks, so this should be relatively safe. > - * Only under severe circumstances should it be possible to free enough > - * inodes to exhaust the reserve block pool via finobt expansion while > - * at the same time not creating free space in the filesystem. > - * > - * Send a warning if the reservation does happen to fail, as the inode > - * now remains allocated and sits on the unlinked list until the fs is > - * repaired. > + * We use a per-AG reservation for any block needed by the finobt tree. > */ > - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, > - XFS_IFREE_SPACE_RES(mp), 0, XFS_TRANS_RESERVE, &tp); > + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ifree, 0, 0, 0, &tp); > if (error) { > - if (error == -ENOSPC) { > - xfs_warn_ratelimited(mp, > - "Failed to remove inode(s) from unlinked list. " > - "Please free space, unmount and run xfs_repair."); > - } else { > - ASSERT(XFS_FORCED_SHUTDOWN(mp)); > - } > + ASSERT(XFS_FORCED_SHUTDOWN(mp)); > return error; > } > > -- > 2.1.4 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html