Currently, we use AGI buffer lock to protect in-memory linked list for unlinked inodes but since it's not necessary to modify AGI unless the head of the unlinked list is modified. Therefore, let's add another per-AG dedicated lock to protect the whole list, and removing AGI buffer lock in xfs_iunlink_remove() if the head is untouched. Note that some paths (e.g. xfs_create_tmpfile()) take AGI lock in its transaction in advance, so the proper locking order is only AGI lock -> unlinked list lock. Signed-off-by: Gao Xiang <hsiangkao@xxxxxxxxxx> --- fs/xfs/xfs_inode.c | 79 ++++++++++++++++++++++++++++++++++++++-------- fs/xfs/xfs_mount.c | 1 + fs/xfs/xfs_mount.h | 3 ++ 3 files changed, 70 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4994398373df..d78aaa8ce252 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2039,6 +2039,46 @@ xfs_iunlink_insert_inode( return xfs_iunlink_update_bucket(tp, agno, agibp, 0, agino); } +/* + * Lock the perag and take AGI lock if agi_unlinked is touched as well for + * xfs_iunlink_remove_inode(). + * + * Inode allocation in the O_TMPFILE path defines the AGI/unlinked list lock + * order as being AGI->perag unlinked list lock. We are inverting it here as + * the fast path tail addition does not need to modify the AGI at all. Hence + * we only need the AGI lock if the tail is empty, correct lock order. + */ +static struct xfs_perag * +xfs_iunlink_remove_lock( + xfs_agino_t agno, + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_buf **agibpp) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_perag *pag; + int error; + + pag = xfs_perag_get(mp, agno); + mutex_lock(&pag->pag_unlinked_mutex); + + if (ip->i_prev_unlinked == NULLAGINO) { + mutex_unlock(&pag->pag_unlinked_mutex); + /* + * some paths (e.g. xfs_create_tmpfile) could take AGI lock + * in this transaction in advance and the only locking order + * AGI buf lock -> pag_unlinked_mutex is safe. + */ + error = xfs_read_agi(mp, tp, agno, agibpp); + if (error) { + xfs_perag_put(pag); + return ERR_PTR(error); + } + mutex_lock(&pag->pag_unlinked_mutex); + } + return pag; +} + /* * Remove can be from anywhere in the list, so we have to do two adjacent inode * lookups here so we can update list pointers. We may be at the head or the @@ -2046,12 +2086,12 @@ xfs_iunlink_insert_inode( */ static int xfs_iunlink_remove_inode( + struct xfs_perag *pag, struct xfs_trans *tp, struct xfs_buf *agibp, struct xfs_inode *ip) { struct xfs_mount *mp = tp->t_mountp; - struct xfs_agi *agi = agibp->b_addr; xfs_agino_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); xfs_agino_t next_agino = ip->i_next_unlinked; @@ -2059,6 +2099,7 @@ xfs_iunlink_remove_inode( if (ip->i_prev_unlinked == NULLAGINO) { struct xlog *log = mp->m_log; + struct xfs_agi *agi = agibp->b_addr; short bucket_index = 0; /* During recovery, the old multiple bucket can be applied */ @@ -2083,7 +2124,7 @@ xfs_iunlink_remove_inode( /* lookup previous inode and update to point at next */ struct xfs_inode *pip; - pip = xfs_iunlink_lookup_prev(agibp->b_pag, ip); + pip = xfs_iunlink_lookup_prev(pag, ip); if (IS_ERR_OR_NULL(pip)) return -EFSCORRUPTED; @@ -2102,7 +2143,7 @@ xfs_iunlink_remove_inode( if (ip->i_next_unlinked != NULLAGINO) { struct xfs_inode *nip; - nip = xfs_iunlink_lookup_next(agibp->b_pag, ip); + nip = xfs_iunlink_lookup_next(pag, ip); if (IS_ERR_OR_NULL(nip)) return -EFSCORRUPTED; @@ -2126,6 +2167,15 @@ xfs_iunlink_remove_inode( return 0; } +static void +xfs_iunlink_unlock( + struct xfs_perag *pag) +{ + /* Does not unlock AGI, ever. xfs_trans_commit() does that. */ + mutex_unlock(&pag->pag_unlinked_mutex); + xfs_perag_put(pag); +} + /* * This is called when the inode's link count has gone to 0 or we are creating * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. @@ -2143,7 +2193,6 @@ xfs_iunlink( xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); int error; - ASSERT(ip->i_next_unlinked == NULLAGINO); ASSERT(VFS_I(ip)->i_nlink == 0); ASSERT(VFS_I(ip)->i_mode != 0); trace_xfs_iunlink(ip); @@ -2153,7 +2202,10 @@ xfs_iunlink( if (error) return error; - return xfs_iunlink_insert_inode(tp, agibp, ip); + mutex_lock(&agibp->b_pag->pag_unlinked_mutex); + error = xfs_iunlink_insert_inode(tp, agibp, ip); + mutex_unlock(&agibp->b_pag->pag_unlinked_mutex); + return error; } /* @@ -2164,19 +2216,20 @@ xfs_iunlink_remove( struct xfs_trans *tp, struct xfs_inode *ip) { - struct xfs_mount *mp = tp->t_mountp; - struct xfs_buf *agibp; - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); + struct xfs_buf *agibp = NULL; + struct xfs_perag *pag; int error; trace_xfs_iunlink_remove(ip); - /* Get the agi buffer first. It ensures lock ordering on the list. */ - error = xfs_read_agi(mp, tp, agno, &agibp); - if (error) - return error; + pag = xfs_iunlink_remove_lock(XFS_INO_TO_AGNO(tp->t_mountp, ip->i_ino), + tp, ip, &agibp); + if (IS_ERR(pag)) + return PTR_ERR(pag); - return xfs_iunlink_remove_inode(tp, agibp, ip); + error = xfs_iunlink_remove_inode(pag, tp, agibp, ip); + xfs_iunlink_unlock(pag); + return error; } /* diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 031e96ff022d..f94e14059e61 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -223,6 +223,7 @@ xfs_initialize_perag( if (first_initialised == NULLAGNUMBER) first_initialised = index; spin_lock_init(&pag->pag_state_lock); + mutex_init(&pag->pag_unlinked_mutex); } index = xfs_set_inode_alloc(mp, agcount); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a72cfcaa4ad1..9a1d0f239fa4 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -372,6 +372,9 @@ typedef struct xfs_perag { /* reference count */ uint8_t pagf_refcount_level; + /* lock to protect unlinked inode list */ + struct mutex pag_unlinked_mutex; + /* * Unlinked inode information. This incore information reflects * data stored in the AGI, so callers must hold the AGI buffer lock -- 2.18.1