Re: [PATCH v3] xfs: add online scrub for superblock counters

Brian Foster <bfoster@xxxxxxxxxx> · Mon, 29 Apr 2019 08:00:29 -0400

On Sun, Apr 28, 2019 at 03:10:06PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> 
> Teach online scrub how to check the filesystem summary counters.  We use
> the incore delalloc block counter along with the incore AG headers to
> compute expected values for fdblocks, icount, and ifree, and then check
> that the percpu counter is within a certain threshold of the expected
> value.  This is done to avoid having to freeze or otherwise lock the
> filesystem, which means that we're only checking that the counters are
> fairly close, not that they're exactly correct.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> ---
> v2: move all the io generating calls into a warmup function
> v3: tighten the aggregation loop and rework the threshold check function
> ---
>  fs/xfs/Makefile           |    1 
>  fs/xfs/libxfs/xfs_fs.h    |    3 
>  fs/xfs/libxfs/xfs_types.c |    2 
>  fs/xfs/libxfs/xfs_types.h |    2 
>  fs/xfs/scrub/common.c     |    9 +
>  fs/xfs/scrub/common.h     |    2 
>  fs/xfs/scrub/fscounters.c |  360 +++++++++++++++++++++++++++++++++++++++++++++
>  fs/xfs/scrub/health.c     |    1 
>  fs/xfs/scrub/scrub.c      |    6 +
>  fs/xfs/scrub/scrub.h      |    9 +
>  fs/xfs/scrub/trace.h      |   63 ++++++++
>  11 files changed, 455 insertions(+), 3 deletions(-)
>  create mode 100644 fs/xfs/scrub/fscounters.c
> 
...
> diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c
> new file mode 100644
> index 000000000000..aeb753dd0eaa
> --- /dev/null
> +++ b/fs/xfs/scrub/fscounters.c
> @@ -0,0 +1,360 @@
...
> +/*
> + * Make sure the per-AG structure has been initialized from the on-disk header
> + * contents and that the incore counters match the ondisk counters.  Do this
> + * from the setup function so that the inner summation loop runs as quickly as
> + * possible.
> + *

I don't see this function matching incore counters against ondisk
counters anywhere.

> + * This function runs during the setup phase /before/ we start checking any
> + * metadata.
> + */
> +STATIC int
> +xchk_fscount_warmup(
> +	struct xfs_scrub	*sc)
> +{
> +	struct xfs_mount	*mp = sc->mp;
> +	struct xfs_buf		*agi_bp = NULL;
> +	struct xfs_buf		*agf_bp = NULL;
> +	struct xfs_perag	*pag = NULL;
> +	xfs_agnumber_t		agno;
> +	int			error = 0;
> +
> +	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
> +		pag = xfs_perag_get(mp, agno);
> +
> +		if (pag->pagi_init && pag->pagf_init)
> +			goto next_loop_perag;
> +
> +		/* Lock both AG headers. */
> +		error = xfs_ialloc_read_agi(mp, sc->tp, agno, &agi_bp);
> +		if (error)
> +			break;
> +		error = xfs_alloc_read_agf(mp, sc->tp, agno, 0, &agf_bp);
> +		if (error)
> +			break;
> +		error = -ENOMEM;
> +		if (!agf_bp || !agi_bp)
> +			break;
> +
> +		/*
> +		 * These are supposed to be initialized by the header read
> +		 * function.
> +		 */
> +		error = -EFSCORRUPTED;
> +		if (!pag->pagi_init || !pag->pagf_init)
> +			break;
> +
> +		xfs_buf_relse(agf_bp);
> +		agf_bp = NULL;
> +		xfs_buf_relse(agi_bp);
> +		agi_bp = NULL;
> +next_loop_perag:
> +		xfs_perag_put(pag);
> +		pag = NULL;
> +		error = 0;
> +
> +		if (fatal_signal_pending(current))
> +			break;
> +	}
> +
> +	if (agf_bp)
> +		xfs_buf_relse(agf_bp);
> +	if (agi_bp)
> +		xfs_buf_relse(agi_bp);
> +	if (pag)
> +		xfs_perag_put(pag);
> +	return error;
> +}
> +
...
> +/*
> + * Is the @counter reasonably close to the @expected value?
> + *
> + * We neither locked nor froze anything in the filesystem while aggregating the
> + * per-AG data to compute the @expected value, which means that the counter
> + * could have changed.  We know the @old_value of the summation of the counter
> + * before the aggregation, and we re-sum the counter now.  If the expected
> + * value falls between the two summations, we're ok.
> + *
> + * Otherwise, we /might/ have a problem.  If the change in the summations is
> + * more than we want to tolerate, the filesystem is probably busy and we should
> + * just send back INCOMPLETE and see if userspace will try again.
> + */
> +static inline bool
> +xchk_fscount_within_range(
> +	struct xfs_scrub	*sc,
> +	const int64_t		old_value,
> +	struct percpu_counter	*counter,
> +	uint64_t		expected)
> +{
> +	int64_t			min_value, max_value;
> +	int64_t			curr_value = percpu_counter_sum(counter);
> +
> +	trace_xchk_fscounters_within_range(sc->mp, expected, curr_value,
> +			old_value);
> +
> +	/* Negative values are always wrong. */
> +	if (curr_value < 0)
> +		return false;
> +
> +	/* Exact matches are always ok. */
> +	if (curr_value == expected)
> +		return true;
> +
> +	min_value = min(old_value, curr_value);
> +	max_value = max(old_value, curr_value);
> +
> +	/* Within the before-and-after range is ok. */
> +	if (expected >= min_value && expected <= max_value)
> +		return true;
> +

If the max/min variance is subject to restrictions, why do we allow the
expected value to pass against those values before we check the variance
below? Is the variance intended to only filter out false corruption
reports? It seems a little strange to me to establish a variance rule
like this where we don't trust them enough to indicate corruption, but
would trust them enough to confirm lack of corruption (as opposed to
telling the user "you might want to try again").

Brian

> +	/*
> +	 * If the difference between the two summations is too large, the fs
> +	 * might just be busy and so we'll mark the scrub incomplete.  Return
> +	 * true here so that we don't mark the counter corrupt.
> +	 */
> +	if (max_value - min_value >= XCHK_FSCOUNT_MIN_VARIANCE) {
> +		xchk_set_incomplete(sc);
> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +/* Check the superblock counters. */
> +int
> +xchk_fscounters(
> +	struct xfs_scrub	*sc)
> +{
> +	struct xfs_mount	*mp = sc->mp;
> +	struct xchk_fscounters	*fsc = sc->buf;
> +	int64_t			icount, ifree, fdblocks;
> +	int			error;
> +
> +	/* Snapshot the percpu counters. */
> +	icount = percpu_counter_sum(&mp->m_icount);
> +	ifree = percpu_counter_sum(&mp->m_ifree);
> +	fdblocks = percpu_counter_sum(&mp->m_fdblocks);
> +
> +	/* No negative values, please! */
> +	if (icount < 0 || ifree < 0 || fdblocks < 0)
> +		xchk_set_corrupt(sc);
> +
> +	/* See if icount is obviously wrong. */
> +	if (icount < fsc->icount_min || icount > fsc->icount_max)
> +		xchk_set_corrupt(sc);
> +
> +	/* See if fdblocks is obviously wrong. */
> +	if (fdblocks > mp->m_sb.sb_dblocks)
> +		xchk_set_corrupt(sc);
> +
> +	/*
> +	 * If ifree exceeds icount by more than the minimum variance then
> +	 * something's probably wrong with the counters.
> +	 */
> +	if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE)
> +		xchk_set_corrupt(sc);
> +
> +	/* Walk the incore AG headers to calculate the expected counters. */
> +	error = xchk_fscount_aggregate_agcounts(sc, fsc);
> +	if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error))
> +		return error;
> +	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE)
> +		return 0;
> +
> +	/* Compare the in-core counters with whatever we counted. */
> +	if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, fsc->icount))
> +		xchk_set_corrupt(sc);
> +
> +	if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree))
> +		xchk_set_corrupt(sc);
> +
> +	if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks,
> +			fsc->fdblocks))
> +		xchk_set_corrupt(sc);
> +
> +	return 0;
> +}
> diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
> index 16b536aa125e..23cf8e2f25db 100644
> --- a/fs/xfs/scrub/health.c
> +++ b/fs/xfs/scrub/health.c
> @@ -109,6 +109,7 @@ static const struct xchk_health_map type_to_health_flag[XFS_SCRUB_TYPE_NR] = {
>  	[XFS_SCRUB_TYPE_UQUOTA]		= { XHG_FS,  XFS_SICK_FS_UQUOTA },
>  	[XFS_SCRUB_TYPE_GQUOTA]		= { XHG_FS,  XFS_SICK_FS_GQUOTA },
>  	[XFS_SCRUB_TYPE_PQUOTA]		= { XHG_FS,  XFS_SICK_FS_PQUOTA },
> +	[XFS_SCRUB_TYPE_FSCOUNTERS]	= { XHG_FS,  XFS_SICK_FS_COUNTERS },
>  };
>  
>  /* Return the health status mask for this scrub type. */
> diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
> index ce13c1c366db..f630389ee176 100644
> --- a/fs/xfs/scrub/scrub.c
> +++ b/fs/xfs/scrub/scrub.c
> @@ -352,6 +352,12 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
>  		.scrub	= xchk_quota,
>  		.repair	= xrep_notsupported,
>  	},
> +	[XFS_SCRUB_TYPE_FSCOUNTERS] = {	/* fs summary counters */
> +		.type	= ST_FS,
> +		.setup	= xchk_setup_fscounters,
> +		.scrub	= xchk_fscounters,
> +		.repair	= xrep_notsupported,
> +	},
>  };
>  
>  /* This isn't a stable feature, warn once per day. */
> diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
> index 01986ed364db..ad1ceb44a628 100644
> --- a/fs/xfs/scrub/scrub.h
> +++ b/fs/xfs/scrub/scrub.h
> @@ -127,6 +127,7 @@ xchk_quota(struct xfs_scrub *sc)
>  	return -ENOENT;
>  }
>  #endif
> +int xchk_fscounters(struct xfs_scrub *sc);
>  
>  /* cross-referencing helpers */
>  void xchk_xref_is_used_space(struct xfs_scrub *sc, xfs_agblock_t agbno,
> @@ -152,4 +153,12 @@ void xchk_xref_is_used_rt_space(struct xfs_scrub *sc, xfs_rtblock_t rtbno,
>  # define xchk_xref_is_used_rt_space(sc, rtbno, len) do { } while (0)
>  #endif
>  
> +struct xchk_fscounters {
> +	uint64_t		icount;
> +	uint64_t		ifree;
> +	uint64_t		fdblocks;
> +	unsigned long long	icount_min;
> +	unsigned long long	icount_max;
> +};
> +
>  #endif	/* __XFS_SCRUB_SCRUB_H__ */
> diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
> index 3c83e8b3b39c..3362bae28b46 100644
> --- a/fs/xfs/scrub/trace.h
> +++ b/fs/xfs/scrub/trace.h
> @@ -50,6 +50,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_RTSUM);
>  TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_UQUOTA);
>  TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_GQUOTA);
>  TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA);
> +TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
>  
>  #define XFS_SCRUB_TYPE_STRINGS \
>  	{ XFS_SCRUB_TYPE_PROBE,		"probe" }, \
> @@ -75,7 +76,8 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_PQUOTA);
>  	{ XFS_SCRUB_TYPE_RTSUM,		"rtsummary" }, \
>  	{ XFS_SCRUB_TYPE_UQUOTA,	"usrquota" }, \
>  	{ XFS_SCRUB_TYPE_GQUOTA,	"grpquota" }, \
> -	{ XFS_SCRUB_TYPE_PQUOTA,	"prjquota" }
> +	{ XFS_SCRUB_TYPE_PQUOTA,	"prjquota" }, \
> +	{ XFS_SCRUB_TYPE_FSCOUNTERS,	"fscounters" }
>  
>  DECLARE_EVENT_CLASS(xchk_class,
>  	TP_PROTO(struct xfs_inode *ip, struct xfs_scrub_metadata *sm,
> @@ -223,6 +225,7 @@ DEFINE_EVENT(xchk_block_error_class, name, \
>  		 void *ret_ip), \
>  	TP_ARGS(sc, daddr, ret_ip))
>  
> +DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_fs_error);
>  DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_error);
>  DEFINE_SCRUB_BLOCK_ERROR_EVENT(xchk_block_preen);
>  
> @@ -590,6 +593,64 @@ TRACE_EVENT(xchk_iallocbt_check_cluster,
>  		  __entry->cluster_ino)
>  )
>  
> +TRACE_EVENT(xchk_fscounters_calc,
> +	TP_PROTO(struct xfs_mount *mp, uint64_t icount, uint64_t ifree,
> +		 uint64_t fdblocks, uint64_t delalloc),
> +	TP_ARGS(mp, icount, ifree, fdblocks, delalloc),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(int64_t, icount_sb)
> +		__field(uint64_t, icount_calculated)
> +		__field(int64_t, ifree_sb)
> +		__field(uint64_t, ifree_calculated)
> +		__field(int64_t, fdblocks_sb)
> +		__field(uint64_t, fdblocks_calculated)
> +		__field(uint64_t, delalloc)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = mp->m_super->s_dev;
> +		__entry->icount_sb = mp->m_sb.sb_icount;
> +		__entry->icount_calculated = icount;
> +		__entry->ifree_sb = mp->m_sb.sb_ifree;
> +		__entry->ifree_calculated = ifree;
> +		__entry->fdblocks_sb = mp->m_sb.sb_fdblocks;
> +		__entry->fdblocks_calculated = fdblocks;
> +		__entry->delalloc = delalloc;
> +	),
> +	TP_printk("dev %d:%d icount %lld:%llu ifree %lld::%llu fdblocks %lld::%llu delalloc %llu",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->icount_sb,
> +		  __entry->icount_calculated,
> +		  __entry->ifree_sb,
> +		  __entry->ifree_calculated,
> +		  __entry->fdblocks_sb,
> +		  __entry->fdblocks_calculated,
> +		  __entry->delalloc)
> +)
> +
> +TRACE_EVENT(xchk_fscounters_within_range,
> +	TP_PROTO(struct xfs_mount *mp, uint64_t expected, int64_t curr_value,
> +		 int64_t old_value),
> +	TP_ARGS(mp, expected, curr_value, old_value),
> +	TP_STRUCT__entry(
> +		__field(dev_t, dev)
> +		__field(uint64_t, expected)
> +		__field(int64_t, curr_value)
> +		__field(int64_t, old_value)
> +	),
> +	TP_fast_assign(
> +		__entry->dev = mp->m_super->s_dev;
> +		__entry->expected = expected;
> +		__entry->curr_value = curr_value;
> +		__entry->old_value = old_value;
> +	),
> +	TP_printk("dev %d:%d expected %llu curr_value %lld old_value %lld",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->expected,
> +		  __entry->curr_value,
> +		  __entry->old_value)
> +)
> +
>  /* repair tracepoints */
>  #if IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR)
>