On Thu, Jun 04, 2020 at 05:46:00PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@xxxxxxxxxx>
> 
> Once we have inodes pinning the cluster buffer and attached whenever
> they are dirty, we no longer have a guarantee that the items are
> flush locked when we lock the cluster buffer. Hence we cannot just
> walk the buffer log item list and modify the attached inodes.
> 
> If the inode is not flush locked, we have to ILOCK it first and then
> flush lock it to do all the prerequisite checks needed to avoid
> races with other code. This is already handled by
> xfs_ifree_get_one_inode(), so rework the inode iteration loop and
> function to update all inodes in cache whether they are attached to
> the buffer or not.
> 
> Note: we also remove the copying of the log item lsn to the
> ili_flush_lsn as xfs_iflush_done() now uses the XFS_ISTALE flag to
> trigger aborts and so flush lsn matching is not needed in IO
> completion for processing freed inodes.
> 
> Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx>
> Reviewed-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> ---
>  fs/xfs/xfs_inode.c | 158 ++++++++++++++++++---------------------------
>  1 file changed, 62 insertions(+), 96 deletions(-)
> 
> diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
> index 272b54cf97000..fb4c614c64fda 100644
> --- a/fs/xfs/xfs_inode.c
> +++ b/fs/xfs/xfs_inode.c
...
> @@ -2559,43 +2563,53 @@ xfs_ifree_get_one_inode(
>  	 */
>  	if (ip != free_ip) {
>  		if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
> +			spin_unlock(&ip->i_flags_lock);
>  			rcu_read_unlock();
>  			delay(1);
>  			goto retry;
>  		}
> -
> -		/*
> -		 * Check the inode number again in case we're racing with
> -		 * freeing in xfs_reclaim_inode(). See the comments in that
> -		 * function for more information as to why the initial check is
> -		 * not sufficient.
> -		 */
> -		if (ip->i_ino != inum) {
> -			xfs_iunlock(ip, XFS_ILOCK_EXCL);
> -			goto out_rcu_unlock;
> -		}

Why is the recheck under ILOCK_EXCL no longer necessary? It looks like
reclaim decides whether to proceed or not under the ilock and doesn't
acquire the spinlock until it decides to reclaim. Hm?
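To try to answer my own question, here's a userspace toy of the
ordering as I read it after this patch (all names below are invented,
this is obviously not the kernel code, just the locking shape). The
property I'm testing is that the i_ino check, the ilock attempt and the
ISTALE marking now all happen under a single i_flags_lock hold, while
reclaim can only invalidate ->i_ino with that same spinlock held:

	/* build with: cc -pthread toy_istale.c */
	#include <assert.h>
	#include <pthread.h>
	#include <stdbool.h>
	#include <unistd.h>

	struct toy_inode {
		pthread_spinlock_t	flags_lock;	/* stands in for ip->i_flags_lock */
		pthread_mutex_t		ilock;		/* stands in for the ILOCK */
		unsigned long		ino;		/* 0 == freed, like ip->i_ino */
		bool			stale;		/* stands in for XFS_ISTALE */
	};

	/* reclaim side: decides under the ilock, then kills the inode
	 * identity under flags_lock */
	static void *toy_reclaim(void *arg)
	{
		struct toy_inode *ip = arg;

		pthread_mutex_lock(&ip->ilock);
		pthread_spin_lock(&ip->flags_lock);
		ip->ino = 0;
		pthread_spin_unlock(&ip->flags_lock);
		pthread_mutex_unlock(&ip->ilock);
		return NULL;
	}

	/* lookup side: check, trylock and mark in one flags_lock section */
	static bool toy_mark_stale(struct toy_inode *ip, unsigned long inum)
	{
		pthread_spin_lock(&ip->flags_lock);
		if (ip->ino != inum) {
			pthread_spin_unlock(&ip->flags_lock);
			return false;		/* lost the race, inode freed */
		}
		if (pthread_mutex_trylock(&ip->ilock) != 0) {
			pthread_spin_unlock(&ip->flags_lock);
			return false;		/* real code: delay(1) + retry */
		}
		/*
		 * No recheck of ->ino needed here: reclaim cannot have
		 * zeroed it, because doing so requires the spinlock we are
		 * still holding.
		 */
		ip->stale = true;
		pthread_spin_unlock(&ip->flags_lock);
		pthread_mutex_unlock(&ip->ilock);	/* real code holds this longer */
		return true;
	}

	int main(void)
	{
		struct toy_inode ip = { .ino = 42 };
		unsigned long ino;
		pthread_t t;
		bool staled;

		pthread_spin_init(&ip.flags_lock, PTHREAD_PROCESS_PRIVATE);
		pthread_mutex_init(&ip.ilock, NULL);
		pthread_create(&t, NULL, toy_reclaim, &ip);

		do {
			staled = toy_mark_stale(&ip, 42);
			pthread_spin_lock(&ip.flags_lock);
			ino = ip.ino;
			pthread_spin_unlock(&ip.flags_lock);
			usleep(100);
		} while (!staled && ino != 0);

		pthread_join(t, NULL);
		return 0;
	}

If that model is right, the old recheck existed because the spinlock
was dropped before xfs_ilock_nowait(), and holding it across the
trylock is what replaces it. A sentence about that in the commit log
would have saved me the head scratching.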
>  	}
> +	ip->i_flags |= XFS_ISTALE;
> +	spin_unlock(&ip->i_flags_lock);
>  	rcu_read_unlock();
> 
> -	xfs_iflock(ip);
> -	xfs_iflags_set(ip, XFS_ISTALE);
> +	/*
> +	 * If we can't get the flush lock, the inode is already attached. All
> +	 * we needed to do here is mark the inode stale so buffer IO completion
> +	 * will remove it from the AIL.
> +	 */

To make sure I'm following this correctly, we can assume the inode is
attached based on an iflock_nowait() failure because we hold the ilock,
right? IOW, any other task doing a similar iflock check would have to
do so under the ilock and release the flush lock first if the inode
didn't end up flushed, for whatever reason.
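Same toy treatment as above for this bit (invented names again, not the
kernel code): the invariant I believe the "already attached" comment
relies on is that attaching to bp->b_li_list only happens with the
flush lock held, completion detaches before unlocking it, and the
buffer lock we hold here keeps completion from running under us. A
semaphore models the flush lock better than a mutex, since it is
released from IO completion context rather than by the locker:

	#include <assert.h>
	#include <semaphore.h>
	#include <stdbool.h>

	struct toy_iitem {
		sem_t	iflock;		/* binary semaphore: the flush lock is
					 * released by a different context (IO
					 * completion), so it is not a mutex */
		bool	attached;	/* on bp->b_li_list or not */
	};

	static void toy_iitem_init(struct toy_iitem *iip)
	{
		sem_init(&iip->iflock, 0, 1);	/* 1 == flush lock not held */
		iip->attached = false;
	}

	/* flush side: called under the ILOCK */
	static void toy_flush(struct toy_iitem *iip)
	{
		int ret = sem_trywait(&iip->iflock);

		assert(ret == 0);	/* caller holds off other flushers */
		iip->attached = true;	/* models list_add_tail() to b_li_list */
	}

	/* IO completion side: runs with the cluster buffer locked */
	static void toy_iflush_done(struct toy_iitem *iip)
	{
		iip->attached = false;	/* models list_del() from b_li_list... */
		sem_post(&iip->iflock);	/* ...strictly before the funlock */
	}

	/*
	 * With the buffer locked (so toy_iflush_done() cannot run
	 * concurrently) and the ILOCK held (so no new flush can start), a
	 * failed trylock can only mean an inode that has been flushed and
	 * is still attached pending IO completion.
	 */
	static void toy_check(struct toy_iitem *iip)
	{
		if (sem_trywait(&iip->iflock) != 0)
			assert(iip->attached);
		else
			sem_post(&iip->iflock);
	}

If I've got that right, the asserts in the !xfs_iflock_nowait() branch
below follow directly, including ili_last_fields having been set by
whoever did the flush. Just want to confirm that's the intended
reasoning.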
> +	iip = ip->i_itemp;
> +	if (!xfs_iflock_nowait(ip)) {
> +		ASSERT(!list_empty(&iip->ili_item.li_bio_list));
> +		ASSERT(iip->ili_last_fields);
> +		goto out_iunlock;
> +	}
> +	ASSERT(!iip || list_empty(&iip->ili_item.li_bio_list));
> 
>  	/*
> -	 * We don't need to attach clean inodes or those only with unlogged
> -	 * changes (which we throw away, anyway).
> +	 * Clean inodes can be released immediately. Everything else has to go
> +	 * through xfs_iflush_abort() on journal commit as the flock
> +	 * synchronises removal of the inode from the cluster buffer against
> +	 * inode reclaim.
>  	 */
> -	if (!ip->i_itemp || xfs_inode_clean(ip)) {
> -		ASSERT(ip != free_ip);
> +	if (xfs_inode_clean(ip)) {
>  		xfs_ifunlock(ip);
> -		xfs_iunlock(ip, XFS_ILOCK_EXCL);
> -		goto out_no_inode;
> +		goto out_iunlock;
>  	}
> -	return ip;
> 
> -out_rcu_unlock:
> -	rcu_read_unlock();
> -out_no_inode:
> -	return NULL;
> +	/* we have a dirty inode in memory that has not yet been flushed. */
> +	ASSERT(iip->ili_fields);
> +	spin_lock(&iip->ili_lock);
> +	iip->ili_last_fields = iip->ili_fields;
> +	iip->ili_fields = 0;
> +	iip->ili_fsync_fields = 0;
> +	spin_unlock(&iip->ili_lock);
> +	list_add_tail(&iip->ili_item.li_bio_list, &bp->b_li_list);
> +	ASSERT(iip->ili_last_fields);

We already asserted ->ili_fields and assigned ->ili_fields to
->ili_last_fields, so this assert seems spurious.

Brian

> +
> +out_iunlock:
> +	if (ip != free_ip)
> +		xfs_iunlock(ip, XFS_ILOCK_EXCL);
>  }
> 
>  /*
> @@ -2605,26 +2619,20 @@ xfs_ifree_get_one_inode(
>   */
>  STATIC int
>  xfs_ifree_cluster(
> -	xfs_inode_t		*free_ip,
> -	xfs_trans_t		*tp,
> +	struct xfs_inode	*free_ip,
> +	struct xfs_trans	*tp,
>  	struct xfs_icluster	*xic)
>  {
> -	xfs_mount_t		*mp = free_ip->i_mount;
> +	struct xfs_mount	*mp = free_ip->i_mount;
> +	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
> +	struct xfs_buf		*bp;
> +	xfs_daddr_t		blkno;
> +	xfs_ino_t		inum = xic->first_ino;
>  	int			nbufs;
>  	int			i, j;
>  	int			ioffset;
> -	xfs_daddr_t		blkno;
> -	xfs_buf_t		*bp;
> -	xfs_inode_t		*ip;
> -	struct xfs_inode_log_item *iip;
> -	struct xfs_log_item	*lip;
> -	struct xfs_perag	*pag;
> -	struct xfs_ino_geometry	*igeo = M_IGEO(mp);
> -	xfs_ino_t		inum;
>  	int			error;
> 
> -	inum = xic->first_ino;
> -	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
>  	nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
> 
>  	for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
> @@ -2668,59 +2676,16 @@ xfs_ifree_cluster(
>  		bp->b_ops = &xfs_inode_buf_ops;
> 
>  		/*
> -		 * Walk the inodes already attached to the buffer and mark them
> -		 * stale. These will all have the flush locks held, so an
> -		 * in-memory inode walk can't lock them. By marking them all
> -		 * stale first, we will not attempt to lock them in the loop
> -		 * below as the XFS_ISTALE flag will be set.
> -		 */
> -		list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
> -			if (lip->li_type == XFS_LI_INODE) {
> -				iip = (struct xfs_inode_log_item *)lip;
> -				xfs_trans_ail_copy_lsn(mp->m_ail,
> -						&iip->ili_flush_lsn,
> -						&iip->ili_item.li_lsn);
> -				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
> -			}
> -		}
> -
> -
> -		/*
> -		 * For each inode in memory attempt to add it to the inode
> -		 * buffer and set it up for being staled on buffer IO
> -		 * completion. This is safe as we've locked out tail pushing
> -		 * and flushing by locking the buffer.
> -		 *
> -		 * We have already marked every inode that was part of a
> -		 * transaction stale above, which means there is no point in
> -		 * even trying to lock them.
> +		 * Now we need to set all the cached clean inodes as XFS_ISTALE,
> +		 * too. This requires lookups, and will skip inodes that we've
> +		 * already marked XFS_ISTALE.
>  		 */
> -		for (i = 0; i < igeo->inodes_per_cluster; i++) {
> -			ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
> -			if (!ip)
> -				continue;
> -
> -			iip = ip->i_itemp;
> -			spin_lock(&iip->ili_lock);
> -			iip->ili_last_fields = iip->ili_fields;
> -			iip->ili_fields = 0;
> -			iip->ili_fsync_fields = 0;
> -			spin_unlock(&iip->ili_lock);
> -			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
> -						&iip->ili_item.li_lsn);
> -
> -			list_add_tail(&iip->ili_item.li_bio_list,
> -						&bp->b_li_list);
> -
> -			if (ip != free_ip)
> -				xfs_iunlock(ip, XFS_ILOCK_EXCL);
> -		}
> +		for (i = 0; i < igeo->inodes_per_cluster; i++)
> +			xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
> 
>  		xfs_trans_stale_inode_buf(tp, bp);
>  		xfs_trans_binval(tp, bp);
>  	}
> -
> -	xfs_perag_put(pag);
>  	return 0;
>  }
> 
> @@ -3845,6 +3810,7 @@ xfs_iflush_int(
>  	iip->ili_fields = 0;
>  	iip->ili_fsync_fields = 0;
>  	spin_unlock(&iip->ili_lock);
> +	ASSERT(iip->ili_last_fields);
> 
>  	/*
>  	 * Store the current LSN of the inode so that we can tell whether the
> -- 
> 2.26.2.761.g0e0b3e54be
> 