Re: [PATCH 13/25] xfs: scrub inode btrees

Dave Chinner <david@xxxxxxxxxxxxx> · Thu, 5 Oct 2017 13:08:10 +1100

On Tue, Oct 03, 2017 at 01:42:11PM -0700, Darrick J. Wong wrote:
> +/*
> + * Set us up to scrub inode btrees.
> + * If we detect a discrepancy between the inobt and the inode,
> + * try again after forcing logged inode cores out to disk.
> + */
> +int
> +xfs_scrub_setup_ag_iallocbt(
> +	struct xfs_scrub_context	*sc,
> +	struct xfs_inode		*ip)
> +{
> +	return xfs_scrub_setup_ag_btree(sc, ip, sc->try_harder);
> +}
> +
> +/* Inode btree scrubber. */
> +
> +/* Is this chunk worth checking? */
> +STATIC bool
> +xfs_scrub_iallocbt_chunk(
> +	struct xfs_scrub_btree		*bs,
> +	struct xfs_inobt_rec_incore	*irec,
> +	xfs_agino_t			agino,
> +	xfs_extlen_t			len)
> +{
> +	struct xfs_mount		*mp = bs->cur->bc_mp;
> +	struct xfs_agf			*agf;
> +	unsigned long long		rec_end;
> +	xfs_agblock_t			eoag;
> +	xfs_agblock_t			bno;
> +
> +	agf = XFS_BUF_TO_AGF(bs->sc->sa.agf_bp);
> +	eoag = be32_to_cpu(agf->agf_length);

Probably should use the AGI for this.

> +	bno = XFS_AGINO_TO_AGBNO(mp, agino);
> +	rec_end = (unsigned long long)bno + len;
> +
> +	if (bno >= mp->m_sb.sb_agblocks || bno >= eoag ||
> +	    rec_end > mp->m_sb.sb_agblocks || rec_end > eoag) {

Same comment as last patch.

> +		xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
> +		return false;
> +	}
> +
> +	return true;
> +}

I note there is no check on the length passed in for the inode
chunk - should that be verified?

> +
> +/* Count the number of free inodes. */
> +static unsigned int
> +xfs_scrub_iallocbt_freecount(
> +	xfs_inofree_t			freemask)
> +{
> +	int				bits = XFS_INODES_PER_CHUNK;
> +	unsigned int			ret = 0;
> +
> +	while (bits--) {
> +		if (freemask & 1)
> +			ret++;
> +		freemask >>= 1;
> +	}

Seems a little cumbersome. Perhaps a loop using xfs_next_bit()
might be a bit faster, something like:

	nextbit = xfs_next_bit(&freemask, 1, 0); 
	while (nextbit != -1) {
		ret++;
		nextbit = xfs_next_bit(&freemask, 1, nextbit + 1);
	}

> +/* Check a particular inode with ir_free. */
> +STATIC int
> +xfs_scrub_iallocbt_check_cluster_freemask(
> +	struct xfs_scrub_btree		*bs,
> +	xfs_ino_t			fsino,
> +	xfs_agino_t			chunkino,
> +	xfs_agino_t			clusterino,
> +	struct xfs_inobt_rec_incore	*irec,
> +	struct xfs_buf			*bp)
> +{
> +	struct xfs_dinode		*dip;
> +	struct xfs_mount		*mp = bs->cur->bc_mp;
> +	bool				freemask_ok;
> +	bool				inuse;
> +	int				error;
> +
> +	dip = xfs_buf_offset(bp, clusterino * mp->m_sb.sb_inodesize);
> +	if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
> +	    (dip->di_version >= 3 &&
> +	     be64_to_cpu(dip->di_ino) != fsino + clusterino)) {
> +		xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
> +		goto out;
> +	}
> +
> +	freemask_ok = !!(irec->ir_free & XFS_INOBT_MASK(chunkino + clusterino));

No need for !!(...) for a bool type - the compiler will squash it
down to 0/1 autmoatically.

> +	error = xfs_icache_inode_is_allocated(mp, bs->cur->bc_tp,
> +			fsino + clusterino, &inuse);
> +	if (error == -ENODATA) {
> +		/* Not cached, just read the disk buffer */

I think that is wrong. xfs_icache_inode_is_allocated() returns
-ENOENT if the inode is not in cache....

> +		freemask_ok ^= !!(dip->di_mode);
> +		if (!bs->sc->try_harder && !freemask_ok)
> +			return -EDEADLOCK;
> +	} else if (error < 0) {
> +		/* Inode is only half assembled, don't bother. */
> +		freemask_ok = true;

Or we had an IO error looking it up. i.e. -EAGAIN is the "half
assembled" state (i.e. in the XFS_INEW state) or the half
*disasembled* state (i.e. XFS_IRECLAIMABLE), anything
else is an error...

> +	} else {
> +		/* Inode is all there. */
> +		freemask_ok ^= inuse;

So inuse is returned from a mode check after iget succeeds. The mode
isn't zeroed until  /after/ XFS_IRECLAIMABLE is set, but it's also
set before XFS_INEW is cleared.  IOWs, how can
xfs_icache_inode_is_allocated() report anything
other than inuse == true here? If that's the case, what's the point
of the mode check inside xfs_icache_inode_is_allocated()?

> +	}
> +	if (!freemask_ok)
> +		xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
> +out:
> +	return 0;
> +}
> +
> +/* Make sure the free mask is consistent with what the inodes think. */
> +STATIC int
> +xfs_scrub_iallocbt_check_freemask(
> +	struct xfs_scrub_btree		*bs,
> +	struct xfs_inobt_rec_incore	*irec)
> +{
> +	struct xfs_owner_info		oinfo;
> +	struct xfs_imap			imap;
> +	struct xfs_mount		*mp = bs->cur->bc_mp;
> +	struct xfs_dinode		*dip;
> +	struct xfs_buf			*bp;
> +	xfs_ino_t			fsino;
> +	xfs_agino_t			nr_inodes;
> +	xfs_agino_t			agino;
> +	xfs_agino_t			chunkino;
> +	xfs_agino_t			clusterino;
> +	xfs_agblock_t			agbno;
> +	int				blks_per_cluster;
> +	uint16_t			holemask;
> +	uint16_t			ir_holemask;
> +	int				error = 0;
> +
> +	/* Make sure the freemask matches the inode records. */
> +	blks_per_cluster = xfs_icluster_size_fsb(mp);
> +	nr_inodes = XFS_OFFBNO_TO_AGINO(mp, blks_per_cluster, 0);

Does this setup and loop work for the case where we have 64k
filesystem blocks and so two or more inode chunks per filesystem
block (i.e. ppc64)? 

> +/* Scrub an inobt/finobt record. */
> +STATIC int
> +xfs_scrub_iallocbt_helper(
> +	struct xfs_scrub_btree		*bs,
> +	union xfs_btree_rec		*rec)
> +{
> +	struct xfs_mount		*mp = bs->cur->bc_mp;
> +	struct xfs_agi			*agi;
> +	struct xfs_inobt_rec_incore	irec;
> +	uint64_t			holes;
> +	xfs_agino_t			agino;
> +	xfs_agblock_t			agbno;
> +	xfs_extlen_t			len;
> +	int				holecount;
> +	int				i;
> +	int				error = 0;
> +	unsigned int			real_freecount;
> +	uint16_t			holemask;
> +
> +	xfs_inobt_btrec_to_irec(mp, rec, &irec);
> +
> +	if (irec.ir_count > XFS_INODES_PER_CHUNK ||
> +	    irec.ir_freecount > XFS_INODES_PER_CHUNK)
> +		xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
> +
> +	real_freecount = irec.ir_freecount +
> +			(XFS_INODES_PER_CHUNK - irec.ir_count);
> +	if (real_freecount != xfs_scrub_iallocbt_freecount(irec.ir_free))
> +		xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
> +
> +	agi = XFS_BUF_TO_AGI(bs->sc->sa.agi_bp);
> +	agino = irec.ir_startino;
> +	agbno = XFS_AGINO_TO_AGBNO(mp, irec.ir_startino);
> +	if (agbno >= be32_to_cpu(agi->agi_length)) {

Validate as per every other agbno?

> +		xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
> +		goto out;
> +	}
> +
> +	if ((agbno & (xfs_ialloc_cluster_alignment(mp) - 1)) ||
> +	    (agbno & (xfs_icluster_size_fsb(mp) - 1)))
> +		xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);

What's the magic masking checks being done here? (comment?)

> +	/* Handle non-sparse inodes */
> +	if (!xfs_inobt_issparse(irec.ir_holemask)) {
> +		len = XFS_B_TO_FSB(mp,
> +				XFS_INODES_PER_CHUNK * mp->m_sb.sb_inodesize);
> +		if (irec.ir_count != XFS_INODES_PER_CHUNK)
> +			xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
> +
> +		if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len))
> +			goto out;
> +		goto check_freemask;
> +	}
> +
> +	/* Check each chunk of a sparse inode cluster. */
> +	holemask = irec.ir_holemask;
> +	holecount = 0;
> +	len = XFS_B_TO_FSB(mp,
> +			XFS_INODES_PER_HOLEMASK_BIT * mp->m_sb.sb_inodesize);
> +	holes = ~xfs_inobt_irec_to_allocmask(&irec);
> +	if ((holes & irec.ir_free) != holes ||
> +	    irec.ir_freecount > irec.ir_count)
> +		xfs_scrub_btree_set_corrupt(bs->sc, bs->cur, 0);
> +
> +	for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; holemask >>= 1,
> +			i++, agino += XFS_INODES_PER_HOLEMASK_BIT) {

Urk. THat's a bit hard to read.

> +		if (holemask & 1) {
> +			holecount += XFS_INODES_PER_HOLEMASK_BIT;
> +			continue;
> +		}
> +
> +		if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len))
> +			break;
> +	}

How about

	for (i = 0; i < XFS_INOBT_HOLEMASK_BITS; i++) {
		if (holemask & 1) {
			holecount += XFS_INODES_PER_HOLEMASK_BIT;
		} else if (!xfs_scrub_iallocbt_chunk(bs, &irec, agino, len))
			break;

		holemask >>= 1;
		agino += XFS_INODES_PER_HOLEMASK_BIT;
	}

Cheers,

Dave.
-- 
Dave Chinner
david@xxxxxxxxxxxxx
--
To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html