On Thu, Oct 05, 2017 at 03:04:52PM +1100, Dave Chinner wrote: > On Tue, Oct 03, 2017 at 01:42:30PM -0700, Darrick J. Wong wrote: > > @@ -559,3 +563,64 @@ xfs_scrub_setup_ag_btree( > > > > return xfs_scrub_ag_init(sc, sc->sm->sm_agno, &sc->sa); > > } > > + > > +/* > > + * Given an inode and the scrub control structure, grab either the > > + * inode referenced in the control structure or the inode passed in. > > + * The inode is not locked. > > + */ > > +int > > +xfs_scrub_get_inode( > > + struct xfs_scrub_context *sc, > > + struct xfs_inode *ip_in) > > +{ > > + struct xfs_mount *mp = sc->mp; > > + struct xfs_inode *ips = NULL; > > *ip? > > > + int error; > > + > > + if (sc->sm->sm_agno || (sc->sm->sm_gen && !sc->sm->sm_ino)) > > + return -EINVAL; > > What's this checking? /* * Jump out if userspace fed us an AG number or a inode generation * without an inode number. Both indicate that userspace hasn't got a * clue. */ > > + > > + /* We want to scan the inode we already had opened. */ > > + if (sc->sm->sm_ino == 0 || sc->sm->sm_ino == ip_in->i_ino) { > > + sc->ip = ip_in; > > + return 0; > > + } > > + > > + /* Look up the inode, see if the generation number matches. */ > > + if (xfs_internal_inum(mp, sc->sm->sm_ino)) > > + return -ENOENT; > > maybe xfs_internal_inum should be moved to the same place as all the > inode/agbno/bno verification functions.... Yes. > > + error = xfs_iget(mp, NULL, sc->sm->sm_ino, XFS_IGET_UNTRUSTED, > > + 0, &ips); > > I think we also want XFS_IGET_DONTCACHE here, so we don't trash the > inode cache with inodes that we use once for scrub and never touch > again. I thought about adding this, but if we let the inodes fall out of the cache now then we'll just have to load them back in for the bmap checks, right? > > + if (error == -ENOENT || error == -EINVAL) { > > + /* inode doesn't exist... */ > > + return -ENOENT; > > + } else if (error) { > > + trace_xfs_scrub_op_error(sc, > > + XFS_INO_TO_AGNO(mp, sc->sm->sm_ino), > > + XFS_INO_TO_AGBNO(mp, sc->sm->sm_ino), > > + error, __return_address); > > + return error; > > + } > > + if (VFS_I(ips)->i_generation != sc->sm->sm_gen) { > > + iput(VFS_I(ips)); > > + return -ENOENT; > > + } > > + > > + sc->ip = ips; > > + return 0; > > +} > > + > > +/* Push everything out of the log onto disk. */ > > +int > > +xfs_scrub_checkpoint_log( > > + struct xfs_mount *mp) > > +{ > > + int error; > > + > > + error = _xfs_log_force(mp, XFS_LOG_SYNC, NULL); > > + if (error) > > + return error; > > + xfs_ail_push_all_sync(mp->m_ail); > > + return 0; > > +} > > Oooo, that's a nasty thing to do on busy systems with large dirty > logs. I hope this is a "last resort" kind of thing.... It is; we only do this if the inobt says there's an inode there and the inode verifiers fail. > > +/* Set us up with an inode. */ > > What state are we trying to get the inode into here? We grab all the > various locks, but we can still have data changing via mmap pages > that are already faulted in and delalloc extents in the incore > extent list that aren't reflected on disk... > > A comment explaining what we expect here would be nice. /* * Grab total control of the inode metadata. It doesn't matter here if * the file data is still changing, we just want exclusive access to the * metadata. */ > > +int > > +xfs_scrub_setup_inode( > > + struct xfs_scrub_context *sc, > > + struct xfs_inode *ip) > > +{ > > + struct xfs_mount *mp = sc->mp; > > + int error; > > + > > + /* > > + * Try to get the inode. If the verifiers fail, we try again > > + * in raw mode. > > + */ > > + error = xfs_scrub_get_inode(sc, ip); > > + switch (error) { > > + case 0: > > + break; > > + case -EFSCORRUPTED: > > + case -EFSBADCRC: > > + return xfs_scrub_checkpoint_log(mp); > > + default: > > + return error; > > + } > > + > > + /* Got the inode, lock it and we're ready to go. */ > > + sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; > > + xfs_ilock(sc->ip, sc->ilock_flags); > > + error = xfs_scrub_trans_alloc(sc->sm, mp, &sc->tp); > > + if (error) > > + goto out_unlock; > > + sc->ilock_flags |= XFS_ILOCK_EXCL; > > + xfs_ilock(sc->ip, XFS_ILOCK_EXCL); > > Should the inode be joined to the transaction so that cancelling the > transaction unlocks the inode? Then the need for the ilock_flags > variable goes away.... This is the confluence of two semi-icky things: first, some of the scrubbers (particularly the dir and parent pointer scrubbers) will need to drop the ILOCK for short periods of time; later on, repair will want to keep the inode locked across all the repair transactions, so it makes more sense to control the lock and unlock directly. > > + > > + return error; > > +out_unlock: > > + xfs_iunlock(sc->ip, sc->ilock_flags); > > + if (sc->ip != ip) > > + iput(VFS_I(sc->ip)); > > + sc->ip = NULL; > > + return error; > > +} > > + > > +/* Inode core */ > > + > > +/* Scrub an inode. */ > > +int > > +xfs_scrub_inode( > > + struct xfs_scrub_context *sc) > > +{ > > + struct xfs_imap imap; > > + struct xfs_dinode di; > > + struct xfs_mount *mp = sc->mp; > > + struct xfs_buf *bp = NULL; > > + struct xfs_dinode *dip; > > + xfs_ino_t ino; > > + size_t fork_recs; > > + unsigned long long isize; > > + uint64_t flags2; > > + uint32_t nextents; > > + uint32_t extsize; > > + uint32_t cowextsize; > > + uint16_t flags; > > + uint16_t mode; > > + bool has_shared; > > + int error = 0; > > + > > + /* Did we get the in-core inode, or are we doing this manually? */ > > + if (sc->ip) { > > + ino = sc->ip->i_ino; > > + xfs_inode_to_disk(sc->ip, &di, 0); > > + dip = &di; > > + } else { > > + /* Map & read inode. */ > > + ino = sc->sm->sm_ino; > > + error = xfs_imap(mp, sc->tp, ino, &imap, XFS_IGET_UNTRUSTED); > > + if (error == -EINVAL) { > > + /* > > + * Inode could have gotten deleted out from under us; > > + * just forget about it. > > + */ > > + error = -ENOENT; > > + goto out; > > + } > > + if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino), > > + XFS_INO_TO_AGBNO(mp, ino), &error)) > > + goto out; > > + > > + error = xfs_trans_read_buf(mp, sc->tp, mp->m_ddev_targp, > > + imap.im_blkno, imap.im_len, XBF_UNMAPPED, &bp, > > + NULL); > > + if (!xfs_scrub_op_ok(sc, XFS_INO_TO_AGNO(mp, ino), > > + XFS_INO_TO_AGBNO(mp, ino), &error)) > > + goto out; > > + > > + /* Is this really an inode? */ > > + bp->b_ops = &xfs_inode_buf_ops; > > + dip = xfs_buf_offset(bp, imap.im_boffset); > > + if (!xfs_dinode_verify(mp, ino, dip) || > > + !xfs_dinode_good_version(mp, dip->di_version)) { > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + goto out; > > + } > > + > > + /* ...and is it the one we asked for? */ > > + if (be32_to_cpu(dip->di_gen) != sc->sm->sm_gen) { > > + error = -ENOENT; > > + goto out; > > + } > > + } > > Can we factor the manual mapping into a separate function? Just > makes it a bit cleaner and gets rid of a bunch of local variables > from this function that are just used to map and read the inode. > > ANd reading on and getting ahead of the code, could we split it > further into > > <setup dip> > > xfs_scrub_dinode(sc, ino, dip, bp); > > <do live incore inode stuff> Yes, good plan. > > + > > + flags = be16_to_cpu(dip->di_flags); > > + if (dip->di_version >= 3) > > + flags2 = be64_to_cpu(dip->di_flags2); > > + else > > + flags2 = 0; > > + > > + /* di_mode */ > > + mode = be16_to_cpu(dip->di_mode); > > + if (mode & ~(S_IALLUGO | S_IFMT)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + > > + /* v1/v2 fields */ > > + switch (dip->di_version) { > > + case 1: > > + if (dip->di_nlink != 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + > > + if (dip->di_mode == 0 && sc->ip) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + > > + if (dip->di_projid_lo != 0 || dip->di_projid_hi != 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > We don't really support v1 inode format anymore - we convert it to > version 2 automatically in xfs_inode_from_disk() so the in-memory > inode is always v2 or v3, never v1. And when we write it back out, > we write it as a v2 inode, never as a v1 inode. > > Hence I'm not sure whether we should be worrying about scrubbing > such inodes - they are going to be in an ever shrinking minority > of filesystems. At minimum, they should always return "preen". Ok. I figured that we might end up at "v1 => preen" but decided to play this straight until we got to review. > > + case 2: > > + case 3: > > + if (dip->di_onlink != 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + > > + if (dip->di_mode == 0 && sc->ip) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + > > + if (dip->di_projid_hi != 0 && > > + !xfs_sb_version_hasprojid32bit(&mp->m_sb)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + default: > > + ASSERT(0); > > If we don't understand the version, it's corrupt. Yikes an assert. Will replace that stat! > > + break; > > + } > > + > > + /* > > + * di_uid/di_gid -- -1 isn't invalid, but there's no way that > > + * userspace could have created that. > > + */ > > + if (dip->di_uid == cpu_to_be32(-1U) || > > + dip->di_gid == cpu_to_be32(-1U)) > > + xfs_scrub_ino_set_warning(sc, bp); > > + > > + /* di_format */ > > + switch (dip->di_format) { > > + case XFS_DINODE_FMT_DEV: > > + if (!S_ISCHR(mode) && !S_ISBLK(mode) && > > + !S_ISFIFO(mode) && !S_ISSOCK(mode)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_LOCAL: > > + if (!S_ISDIR(mode) && !S_ISLNK(mode)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_EXTENTS: > > + if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_BTREE: > > + if (!S_ISREG(mode) && !S_ISDIR(mode)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_UUID: > > + default: > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + } > > + > > + /* di_size */ > > + isize = be64_to_cpu(dip->di_size); > > + if (isize & (1ULL << 63)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > Should we be checking against the on disk format size, or the > mounted filesystem maximum size (i.e. mp->m_super->s_maxbytes)? > 32 or 64 bit systems are going to have different maximum valid file > sizes.. It's perfectly valid to 'truncate -s $((2 ** 60) foofile' so the only thing we can really check for here is that the upper bit isn't set (because the VFS does not check, but barfs on, files with that large of a size). > Directories have a maximum bound size, too - the data space, leaf > space and freespace space, each of which are 32GB in size, IIRC. > > And symlinks have a different maximum size, too. Fair enough, I'll expand the i_size checks, though ISTR the verifiers now check that for us. > > + if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > > + > > + /* di_nblocks */ > > + if (flags2 & XFS_DIFLAG2_REFLINK) { > > + ; /* nblocks can exceed dblocks */ > > + } else if (flags & XFS_DIFLAG_REALTIME) { > > + if (be64_to_cpu(dip->di_nblocks) >= > > + mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > That doesn't seem right. the file can be on either the data or the > rt device, so the maximum file blocks is the size of one device or > the other, not both combined. di_nblocks is the sum of (data blocks + bmbt blocks + attr blocks), right? So in theory if you had a rt file with 1000 data blocks, 10 bmbt blocks to map the data blocks, and 100 attr blocks then di_nblocks has to be 1110. > > + } else { > > + if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + } > > + > > + /* di_extsize */ > > + extsize = be32_to_cpu(dip->di_extsize); > > + if (flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) { > > + if (extsize <= 0 || extsize > MAXEXTLEN) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + > > + if (!(flags & XFS_DIFLAG_REALTIME) && > > + extsize > mp->m_sb.sb_agblocks / 2) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + } else { > > + if (extsize != 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + } > > There more to validating the extentsize hints than this. From > xfs_ioctl_setattr_check_extsize(): > > /* > * extent size hint validation is somewhat cumbersome. Rules are: > * > * 1. extent size hint is only valid for directories and regular files > * 2. FS_XFLAG_EXTSIZE is only valid for regular files > * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. > * 4. can only be changed on regular files if no extents are allocated > * 5. can be changed on directories at any time > * 6. extsize hint of 0 turns off hints, clears inode flags. > * 7. Extent size must be a multiple of the appropriate block size. > * 8. for non-realtime files, the extent size hint must be limited > * to half the AG size to avoid alignment extending the extent beyond the > * limits of the AG. > */ > > Maybe there's some commonality between these two functions... Definitely, will refactor both. > > + > > + /* di_flags */ > > + if ((flags & XFS_DIFLAG_IMMUTABLE) && (flags & XFS_DIFLAG_APPEND)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME)) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > What about project id flags? > > Also, there are flags that are allowed only on regular files, and > there are flags that are only allowed on directories. Those should > probably also be checked for preening. <nod> Some of these are checked by the dinode verifier, but there needs to be a comment (or more comment) explaining that. > > + > > + /* di_nextents */ > > + nextents = be32_to_cpu(dip->di_nextents); > > + fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); > > + switch (dip->di_format) { > > + case XFS_DINODE_FMT_EXTENTS: > > + if (nextents > fork_recs) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_BTREE: > > + if (nextents <= fork_recs) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_LOCAL: > > + case XFS_DINODE_FMT_DEV: > > + case XFS_DINODE_FMT_UUID: > > + default: > > + if (nextents != 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + } > > + > > + /* di_anextents */ > > + nextents = be16_to_cpu(dip->di_anextents); > > + fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); > > + switch (dip->di_aformat) { > > + case XFS_DINODE_FMT_EXTENTS: > > + if (nextents > fork_recs) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_BTREE: > > + if (nextents <= fork_recs) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + case XFS_DINODE_FMT_LOCAL: > > + case XFS_DINODE_FMT_DEV: > > + case XFS_DINODE_FMT_UUID: > > + default: > > + if (nextents != 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + break; > > + } > > Don't we need a check here first to see whether an attribute fork > exists or not? Do you mean the xfs_inode_fork, or something else? XFS_DFORK_ASIZE returns zero if !XFS_DFORK_Q which in turn is based on di_forkoff so we're really only checking that di_aformat makes sense given the number of extents and the size of the attr fork area. We're not actually checking anything in the attr fork; that's a different scrubber. > > + > > + /* di_forkoff */ > > + if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + if (dip->di_anextents != 0 && dip->di_forkoff == 0) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > + > > + /* di_aformat */ > > + if (dip->di_aformat != XFS_DINODE_FMT_LOCAL && > > + dip->di_aformat != XFS_DINODE_FMT_EXTENTS && > > + dip->di_aformat != XFS_DINODE_FMT_BTREE) > > + xfs_scrub_ino_set_corrupt(sc, ino, bp); > > Shouldn't this come before we use dip->di_aformat in a switch > statement? Yes. Good catch. > Hmmm - aren't we missing the same checks for the data fork? I believe you'll find them further up in the function. --D > > Cheers, > > Dave. > -- > Dave Chinner > david@xxxxxxxxxxxxx > -- > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html