Re: [RFC PATCH 4/4] xfs: support shrinking empty AGs

Gao Xiang <hsiangkao@xxxxxxxxxx> · Thu, 15 Apr 2021 13:22:26 +0800

Hi Dave,

On Thu, Apr 15, 2021 at 02:25:49PM +1000, Dave Chinner wrote:
> On Thu, Apr 15, 2021 at 03:52:40AM +0800, Gao Xiang wrote:
> > Roughly, freespace can be shrinked atomicly with the following steps:
> > 
> >  - make sure the pending-for-discard AGs are all stablized as empty;
> >  - a transaction to
> >      fix up freespace btrees for the target tail AG;
> >      decrease agcount to the target value.
> > 
> > In order to make sure such AGs are empty, first to mark them as
> > inactive, then check if these AGs are all empty one-by-one. Also
> > need to log force, complete all discard operations together.
> > 
> > Even it's needed to drain out all related cached buffers in case of
> > corrupt fs if growfs again, so ail items are all needed to be pushed
> > out as well.
> > 
> > Signed-off-by: Gao Xiang <hsiangkao@xxxxxxxxxx>
> > ---
> >  fs/xfs/libxfs/xfs_ag.c |   1 -
> >  fs/xfs/libxfs/xfs_ag.h |   2 +-
> >  fs/xfs/xfs_fsops.c     | 148 ++++++++++++++++++++++++++++++++++++++---
> >  fs/xfs/xfs_mount.c     |  87 +++++++++++++++++++-----
> >  fs/xfs/xfs_trans.c     |   1 -
> >  5 files changed, 210 insertions(+), 29 deletions(-)
> > 
> > diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
> > index ba5702e5c9ad..eb370fbae192 100644
> > --- a/fs/xfs/libxfs/xfs_ag.c
> > +++ b/fs/xfs/libxfs/xfs_ag.c
> > @@ -512,7 +512,6 @@ xfs_ag_shrink_space(
> >  	struct xfs_agf		*agf;
> >  	int			error, err2;
> >  
> > -	ASSERT(agno == mp->m_sb.sb_agcount - 1);
> >  	error = xfs_ialloc_read_agi(mp, *tpp, agno, &agibp);
> >  	if (error)
> >  		return error;
> > diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
> > index 4535de1d88ea..7031e2c7ef66 100644
> > --- a/fs/xfs/libxfs/xfs_ag.h
> > +++ b/fs/xfs/libxfs/xfs_ag.h
> > @@ -15,7 +15,7 @@ struct aghdr_init_data {
> >  	xfs_agblock_t		agno;		/* ag to init */
> >  	xfs_extlen_t		agsize;		/* new AG size */
> >  	struct list_head	buffer_list;	/* buffer writeback list */
> > -	xfs_rfsblock_t		nfree;		/* cumulative new free space */
> > +	int64_t			nfree;		/* cumulative new free space */
> >  
> >  	/* per header data */
> >  	xfs_daddr_t		daddr;		/* header location */
> > diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
> > index b33c894b6cf3..659ca1836bba 100644
> > --- a/fs/xfs/xfs_fsops.c
> > +++ b/fs/xfs/xfs_fsops.c
> > @@ -14,11 +14,14 @@
> >  #include "xfs_trans.h"
> >  #include "xfs_error.h"
> >  #include "xfs_alloc.h"
> > +#include "xfs_ialloc.h"
> > +#include "xfs_extent_busy.h"
> >  #include "xfs_fsops.h"
> >  #include "xfs_trans_space.h"
> >  #include "xfs_log.h"
> >  #include "xfs_ag.h"
> >  #include "xfs_ag_resv.h"
> > +#include "xfs_trans_priv.h"
> >  
> >  /*
> >   * Write new AG headers to disk. Non-transactional, but need to be
> > @@ -78,6 +81,112 @@ xfs_resizefs_init_new_ags(
> >  	return error;
> >  }
> >  
> > +static int
> > +xfs_shrinkfs_deactivate_ags(
> > +	struct xfs_mount        *mp,
> > +	xfs_agnumber_t		oagcount,
> > +	xfs_agnumber_t		nagcount)
> > +{
> > +	xfs_agnumber_t		agno;
> > +	int			error;
> > +
> > +	/* confirm AGs pending for shrinking are all inactive */
> > +	for (agno = nagcount; agno < oagcount; ++agno) {
> > +		struct xfs_buf *agfbp, *agibp;
> > +		struct xfs_perag *pag = xfs_perag_get(mp, agno);
> > +
> > +		down_write(&pag->pag_inactive_rwsem);
> > +		/* need to lock agi, agf buffers here to close all races */
> > +		error = xfs_read_agi(mp, NULL, agno, &agibp);
> > +		if (!error) {
> > +			error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agfbp);
> > +			if (!error) {
> > +				pag->pag_inactive = true;
> > +				xfs_buf_relse(agfbp);
> > +			}
> > +			xfs_buf_relse(agibp);
> > +		}
> > +		up_write(&pag->pag_inactive_rwsem);
> > +		xfs_perag_put(pag);
> > +		if (error)
> > +			break;
> > +	}
> > +	return error;
> > +}
> 
> Hmmmm. Ok, that's why the first patch had the specific locking
> pattern it had, because once the AGI is locked under the
> inactive_rwsem. This seems ... fragile. It relies on the code
> looking up the perag to check the pag->pag_inactive flag before it
> takes an AGF or AGI lock, but does not allow a caller than has
> an AGI or AGF locked to take the inactive_sem to check if the per-ag
> is inactive or not. It's a one-way locking mechanism...

It guarantees that when AGF, AGI locked, pag_inactive won't be
switched, and before taking AGF or AGI, pag_inactive_sem should
be taken to confirm AGF, AGI can be read. That is the way that
I can think out with much less invasion than touch more XFS
codebase....

> 
> I'd much prefer active/passive references here, and the order of
> AG inactivation is highest to lowest...
> 
> static void
> xfs_shrinkfs_deactivate_ags(
> 	struct xfs_mount        *mp,
> 	xfs_agnumber_t		oagcount,
> 	xfs_agnumber_t		nagcount)
> {
> 	xfs_agnumber_t		agno;
> 
> 	for (agno = oagcount - 1; agno >= nagcount; agno--) {
> 		struct xfs_perag *pag = xfs_perag_get(mp, agno);
> 
> 		/* drop active reference */
> 		xfs_perag_put_active(pag);
> 
> 		/* wait for pag->pag_active_refs to hit zero */
> 		.....
> 
> 		xfs_perag_put(pag);
> 	}
> }
> 
> At this point, we know there are going to be no new racing accesses
> to the perags...
> 
> 
> > +static void
> > +xfs_shrinkfs_activate_ags(
> > +	struct xfs_mount        *mp,
> > +	xfs_agnumber_t		oagcount,
> > +	xfs_agnumber_t		nagcount)
> > +{
> > +	xfs_agnumber_t		agno;
> > +
> > +	for (agno = nagcount; agno < oagcount; ++agno) {
> > +		struct xfs_perag *pag = xfs_perag_get(mp, agno);
> > +
> > +		down_write(&pag->pag_inactive_rwsem);
> > +		pag->pag_inactive = false;
> > +		up_write(&pag->pag_inactive_rwsem);
> > +	}
> > +}
> 
> static void
> xfs_shrinkfs_reactivate_ags(
> 	struct xfs_mount        *mp,
> 	xfs_agnumber_t		oagcount,
> 	xfs_agnumber_t		nagcount)
> {
> 	xfs_agnumber_t		agno;
> 
> 	for (agno = oagcount - 1; agno >= nagcount; agno--) {
> 		struct xfs_perag *pag = xfs_perag_get(mp, agno);
> 
> 		/* get a new active reference for the mount */
> 		atomic_inc(&pag->pag_active_ref);
> 
> 		xfs_perag_put(pag);
> 	}
> }
> 
> 
> > +
> > +static int
> > +xfs_shrinkfs_prepare_ags(
> > +	struct xfs_mount        *mp,
> > +	struct aghdr_init_data	*id,
> > +	xfs_agnumber_t		oagcount,
> > +	xfs_agnumber_t		nagcount)
> > +{
> > +	xfs_agnumber_t		agno;
> > +	int 			error;
> > +
> > +	error = xfs_shrinkfs_deactivate_ags(mp, oagcount, nagcount);
> > +	if (error)
> > +		goto err_out;
> 
> Waiting for active references to drain means this can't fail.
> 
> > +
> > +	/* confirm AGs pending for shrinking are all empty */
> > +	for (agno = nagcount; agno < oagcount; ++agno) {
> 
> Again, needs to work from last AG back to first.

ok.

> 
> > +		struct xfs_buf		*agfbp;
> > +		struct xfs_perag	*pag;
> > +
> > +		error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agfbp);
> > +		if (error)
> > +			goto err_out;
> > +
> > +		pag = agfbp->b_pag;
> > +		error = xfs_ag_resv_free(pag);
> > +		if (!error) {
> > +			error = xfs_ag_is_empty(agfbp);
> > +			if (!error) {
> > +				ASSERT(!pag->pagf_flcount);
> > +				id->nfree -= pag->pagf_freeblks;
> > +			}
> > +		}
> 
> Please don't nest "if (!error)" statements like this. It results in
> excessive indent in the code, and it makes it harder to determine
> what the actual error handling for failure is.

ok.

> 
> > +		xfs_buf_relse(agfbp);
> > +		if (error)
> > +			goto err_out;
> > +	}
> > +	xfs_log_force(mp, XFS_LOG_SYNC);
> 
> What does this do,

It just makes sure that no ongoing write transactions before tearing
down AGs. The reason it was here about AGFL drain, but since it's a
sync transaction, so I think it should be raised up.

> and why is it not needed before we try to free
> reservations and determine if the AG is empty?

Yeah, but it can only cause false nagative, anyway, I will raise it
up.

> 
> > +	/*
> > +	 * Wait for all busy extents to be freed, including completion of
> > +	 * any discard operation.
> > +	 */
> > +	xfs_extent_busy_wait_all(mp);
> > +	flush_workqueue(xfs_discard_wq);
> 
> Shouldn't this happen before we start trying to tear down the AGs?

May I ask what's your suggestted place? since the AGs are already
inactive here.

> 
> > +
> > +	/*
> > +	 * Also need to drain out all related cached buffers, at least,
> > +	 * in case of growfs back later (which uses uncached buffers.)
> > +	 */
> > +	xfs_ail_push_all_sync(mp->m_ail);
> > +	xfs_buftarg_drain(mp->m_ddev_targp);
> 
> Urk, no, this can livelock on active filesystems.
> 
> What you want to do is drain the per-ag buffer cache, not the global
> filesystem LRU. Given that, at this point, all the buffers still
> cached in the per-ag should have zero references to them, a walk of
> the rbtree taking a reference to each buffer, marking it stale and
> then calling xfs_buf_rele() on it should be sufficient to free all
> the buffers in the AG and release all the remaining passive
> references to the struct perag for the AG.
> 

I understand the issue, yeah, it'd be much better to use
xfs_buftarg_drain() here. Thanks for your suggestion about this.

> At this point, we can remove the perag from the m_perag radix tree,
> do the final teardown on it, and free if via call_rcu()....

I still think active/passive reference approach causes a lot of
random modification all over the whole XFS codebase since it
assumes current perag won't be removed/freed even reference count
reaches zero, adding new active reference counts in principle
sounds better yet a bit far away from the current XFS codebase
status.

Thanks,
Gao Xiang

> 
> > +	return error;
> > +err_out:
> > +	xfs_shrinkfs_activate_ags(mp, oagcount, nagcount);
> > +	return error;
> > +}
> > +
> >  /*
> >   * growfs operations
> >   */
> > @@ -93,7 +202,7 @@ xfs_growfs_data_private(
> >  	xfs_rfsblock_t		nb, nb_div, nb_mod;
> >  	int64_t			delta;
> >  	bool			lastag_extended;
> > -	xfs_agnumber_t		oagcount;
> > +	xfs_agnumber_t		oagcount, agno;
> >  	struct xfs_trans	*tp;
> >  	struct aghdr_init_data	id = {};
> >  
> > @@ -130,14 +239,13 @@ xfs_growfs_data_private(
> >  	oagcount = mp->m_sb.sb_agcount;
> >  
> >  	/* allocate the new per-ag structures */
> > -	if (nagcount > oagcount) {
> > +	if (nagcount > oagcount)
> >  		error = xfs_initialize_perag(mp, nagcount, &nagimax);
> > -		if (error)
> > -			return error;
> > -	} else if (nagcount < oagcount) {
> > -		/* TODO: shrinking the entire AGs hasn't yet completed */
> > -		return -EINVAL;
> > -	}
> > +	else if (nagcount < oagcount)
> > +		error = xfs_shrinkfs_prepare_ags(mp, &id, oagcount, nagcount);
> > +
> > +	if (error)
> > +		return error;
> >  
> >  	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata,
> >  			(delta > 0 ? XFS_GROWFS_SPACE_RES(mp) : -delta), 0,
> > @@ -151,13 +259,29 @@ xfs_growfs_data_private(
> >  	} else {
> >  		static struct ratelimit_state shrink_warning = \
> >  			RATELIMIT_STATE_INIT("shrink_warning", 86400 * HZ, 1);
> > +		xfs_agblock_t	agdelta;
> > +
> >  		ratelimit_set_flags(&shrink_warning, RATELIMIT_MSG_ON_RELEASE);
> >  
> >  		if (__ratelimit(&shrink_warning))
> >  			xfs_alert(mp,
> >  	"EXPERIMENTAL online shrink feature in use. Use at your own risk!");
> >  
> > -		error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, -delta);
> > +		for (agno = nagcount; agno < oagcount; ++agno) {
> > +			struct xfs_perag *pag = xfs_perag_get(mp, agno);
> > +
> > +			pag->pagf_freeblks = 0;
> > +			pag->pagf_longest = 0;
> > +			xfs_perag_put(pag);
> > +		}
> > +
> > +		xfs_trans_agblocks_delta(tp, id.nfree);
> > +
> > +		if (nagcount != oagcount)
> > +			agdelta = nagcount * mp->m_sb.sb_agblocks - nb;
> > +		else
> > +			agdelta = -delta;
> > +		error = xfs_ag_shrink_space(mp, &tp, nagcount - 1, agdelta);
> >  	}
> >  	if (error)
> >  		goto out_trans_cancel;
> > @@ -167,8 +291,10 @@ xfs_growfs_data_private(
> >  	 * seen by the rest of the world until the transaction commit applies
> >  	 * them atomically to the superblock.
> >  	 */
> > -	if (nagcount > oagcount)
> > -		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT, nagcount - oagcount);
> > +	if (nagcount != oagcount)
> > +		xfs_trans_mod_sb(tp, XFS_TRANS_SB_AGCOUNT,
> > +				 (int64_t)nagcount - (int64_t)oagcount);
> > +
> >  	if (delta)
> >  		xfs_trans_mod_sb(tp, XFS_TRANS_SB_DBLOCKS, delta);
> >  	if (id.nfree)
> > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
> > index 69a60b5f4a32..ca9513fdc09e 100644
> > --- a/fs/xfs/xfs_mount.c
> > +++ b/fs/xfs/xfs_mount.c
> > @@ -172,6 +172,47 @@ xfs_sb_validate_fsb_count(
> >  	return 0;
> >  }
> >  
> > +static int
> > +xfs_perag_reset(
> > +	struct xfs_perag	*pag)
> > +{
> > +	int	error;
> > +
> > +	spin_lock_init(&pag->pag_ici_lock);
> > +	INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
> > +	INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
> > +
> > +	error = xfs_buf_hash_init(pag);
> > +	if (error)
> > +		return error;
> > +
> > +	init_waitqueue_head(&pag->pagb_wait);
> > +	spin_lock_init(&pag->pagb_lock);
> > +	pag->pagb_count = 0;
> > +	pag->pagb_tree = RB_ROOT;
> > +
> > +	error = xfs_iunlink_init(pag);
> > +	if (error) {
> > +		xfs_buf_hash_destroy(pag);
> > +		return error;
> > +	}
> > +	spin_lock_init(&pag->pag_state_lock);
> > +	return 0;
> > +}
> > +
> > +static int
> > +xfs_perag_inactive_reset(
> > +	struct xfs_perag	*pag)
> > +{
> > +	cancel_delayed_work_sync(&pag->pag_blockgc_work);
> > +	xfs_iunlink_destroy(pag);
> > +	xfs_buf_hash_destroy(pag);
> > +
> > +	memset((char *)pag + offsetof(struct xfs_perag, pag_inactive), 0,
> > +	       sizeof(*pag) - offsetof(struct xfs_perag, pag_inactive));
> > +	return xfs_perag_reset(pag);
> > +}
> > +
> >  int
> >  xfs_initialize_perag(
> >  	xfs_mount_t	*mp,
> > @@ -180,6 +221,8 @@ xfs_initialize_perag(
> >  {
> >  	xfs_agnumber_t	index;
> >  	xfs_agnumber_t	first_initialised = NULLAGNUMBER;
> > +	xfs_agnumber_t	first_inactive = NULLAGNUMBER;
> > +	xfs_agnumber_t	last_inactive = NULLAGNUMBER;
> >  	xfs_perag_t	*pag;
> >  	int		error = -ENOMEM;
> >  
> > @@ -191,6 +234,20 @@ xfs_initialize_perag(
> >  	for (index = 0; index < agcount; index++) {
> >  		pag = xfs_perag_get(mp, index);
> >  		if (pag) {
> > +			down_write(&pag->pag_inactive_rwsem);
> > +			if (pag->pag_inactive) {
> > +				error = xfs_perag_inactive_reset(pag);
> > +				if (error) {
> > +					pag->pag_inactive = true;
> > +					up_write(&pag->pag_inactive_rwsem);
> > +					xfs_perag_put(pag);
> > +					goto out_unwind_new_pags;
> > +				}
> > +				if (first_inactive == NULLAGNUMBER)
> > +					first_inactive = index;
> > +				last_inactive = index;
> > +			}
> > +			up_write(&pag->pag_inactive_rwsem);
> 
> This should all go away if the perags have already been removed from
> the tree. In fact, you shouldn't need to call xfs_initialize_perag()
> at all for the shrink case that removes whole AGs....
> 
> CHeers,
> 
> Dave.
> -- 
> Dave Chinner
> david@xxxxxxxxxxxxx
>