On Tue, Jul 07, 2020 at 09:57:41PM +0800, Gao Xiang wrote: > Currently, we use AGI buffer lock to protect in-memory linked list for > unlinked inodes but since it's not necessary to modify AGI unless the > head of the unlinked list is modified. So let's removing the AGI buffer > modification dependency if possible, including 1) adding another per-AG > dedicated lock to protect the whole list and 2) inserting unlinked > inodes from tail. > > For 2), the tail of bucket 0 is now recorded in perag for xfs_iunlink() > to use. xfs_iunlink_remove() still support old multiple short bucket > lists for recovery code. > > Note that some paths take AGI lock in its transaction in advance, > so the proper locking order is only AGI lock -> unlinked list lock. How much agi buffer lock contention does this eliminate? > Signed-off-by: Gao Xiang <hsiangkao@xxxxxxxxxx> > --- > fs/xfs/xfs_inode.c | 251 ++++++++++++++++++++------------------- > fs/xfs/xfs_log_recover.c | 6 + > fs/xfs/xfs_mount.c | 3 + > fs/xfs/xfs_mount.h | 3 + > 4 files changed, 144 insertions(+), 119 deletions(-) > > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index 10565fa5ace4..d33e5b198534 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -1994,182 +1994,195 @@ xfs_iunlink_update_bucket( > } > > /* > - * Always insert at the head, so we only have to do a next inode lookup to > - * update it's prev pointer. The AGI bucket will point at the one we are > - * inserting. > + * This is called when the inode's link count has gone to 0 or we are creating > + * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. > + * > + * We place the on-disk inode on a list in the AGI. It will be pulled from this > + * list when the inode is freed. > */ > -static int > -xfs_iunlink_insert_inode( > +STATIC int > +xfs_iunlink( > struct xfs_trans *tp, > - struct xfs_buf *agibp, > struct xfs_inode *ip) > { > struct xfs_mount *mp = tp->t_mountp; > - struct xfs_agi *agi = agibp->b_addr; > - xfs_agino_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); > + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); > + struct xfs_perag *pag; > + struct xfs_inode *pip; > xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); > - xfs_agino_t next_agino; > + int error; > + > + ASSERT(VFS_I(ip)->i_nlink == 0); > + ASSERT(VFS_I(ip)->i_mode != 0); > + trace_xfs_iunlink(ip); > + > + pag = xfs_perag_get(mp, agno); > > /* > - * Get the index into the agi hash table for the list this inode will > - * go on. Make sure the pointer isn't garbage and that this inode > - * isn't already on the list. > + * some paths (e.g. xfs_create_tmpfile) could take AGI lock > + * in this transaction in advance and the only locking order > + * AGI buf lock -> pag_unlinked_mutex is safe. > */ > - next_agino = be32_to_cpu(agi->agi_unlinked[0]); > - if (next_agino == agino || > - !xfs_verify_agino_or_null(mp, agno, next_agino)) { > - xfs_buf_mark_corrupt(agibp); > - return -EFSCORRUPTED; > - } > - > - ip->i_prev_unlinked = NULLAGINO; > - ip->i_next_unlinked = next_agino; > - if (ip->i_next_unlinked != NULLAGINO) { > - struct xfs_inode *nip; > + mutex_lock(&pag->pag_unlinked_mutex); > + pip = pag->pag_unlinked_tail; > + if (!pip) { > + struct xfs_buf *agibp; > > - nip = xfs_iunlink_lookup_next(agibp->b_pag, ip); > - if (IS_ERR_OR_NULL(nip)) > - return -EFSCORRUPTED; > + mutex_unlock(&pag->pag_unlinked_mutex); > > - if (nip->i_prev_unlinked != NULLAGINO) { > - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, > - NULL, 0, __this_address); > - return -EFSCORRUPTED; > + /* > + * there could be some race, but it doesn't matter though > + * since !pip doesn't happen frequently. > + */ > + error = xfs_read_agi(mp, tp, agno, &agibp); > + if (error) { > + xfs_perag_put(pag); > + return error; > } > - nip->i_prev_unlinked = agino; > > - /* update the on disk inode now */ > - xfs_iunlink_log(tp, ip); > - } > + mutex_lock(&pag->pag_unlinked_mutex); > + pip = pag->pag_unlinked_tail; > + if (!pip) { > + struct xfs_agi *agi; > + > + ip->i_prev_unlinked = NULLAGINO; > > - /* Point the head of the list to point to this inode. */ > - return xfs_iunlink_update_bucket(tp, agno, agibp, next_agino, agino); > + agi = agibp->b_addr; > + ASSERT(be32_to_cpu(agi->agi_unlinked[0]) == NULLAGINO); > + > + /* Point the head of the list to point to this inode. */ > + error = xfs_iunlink_update_bucket(tp, agno, > + agibp, NULLAGINO, agino); > + goto out; > + } > + } > + ip->i_prev_unlinked = XFS_INO_TO_AGINO(mp, pip->i_ino); > + ASSERT(pip->i_next_unlinked == NULLAGINO); > + pip->i_next_unlinked = agino; > + xfs_iunlink_log(tp, pip); > +out: > + ip->i_next_unlinked = NULLAGINO; > + pag->pag_unlinked_tail = ip; > + mutex_unlock(&pag->pag_unlinked_mutex); > + xfs_perag_put(pag); > + return 0; > } > > /* > + * Pull the on-disk inode from the AGI unlinked list. > + * > * Remove can be from anywhere in the list, so we have to do two adjacent inode > * lookups here so we can update list pointers. We may be at the head or the > * tail of the list, so we have to handle those cases as well. > */ > -static int > -xfs_iunlink_remove_inode( > +STATIC int > +xfs_iunlink_remove( > struct xfs_trans *tp, > - struct xfs_buf *agibp, > struct xfs_inode *ip) > { > struct xfs_mount *mp = tp->t_mountp; > - xfs_agino_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); > + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); > + struct xfs_perag *pag; > + struct xfs_inode *pip = NULL; > xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); > - xfs_agino_t next_agino = ip->i_next_unlinked; > + xfs_agino_t next_agino; > int error; > > + trace_xfs_iunlink_remove(ip); > + > + pag = xfs_perag_get(mp, agno); > + mutex_lock(&pag->pag_unlinked_mutex); > + > + /* see the comment in xfs_iunlink() on the only proper locking order */ > if (ip->i_prev_unlinked == NULLAGINO) { > - /* remove from head of list */ > - if (next_agino == agino || > - !xfs_verify_agino_or_null(mp, agno, next_agino)) > - return -EFSCORRUPTED; > + struct xfs_buf *agibp; > > - error = xfs_iunlink_update_bucket(tp, agno, agibp, agino, next_agino); > - if (error) > + mutex_unlock(&pag->pag_unlinked_mutex); > + > + error = xfs_read_agi(mp, tp, agno, &agibp); > + if (error) { > + xfs_perag_put(pag); > return error; > - } else { > - /* lookup previous inode and update to point at next */ > - struct xfs_inode *pip; > + } > > - pip = xfs_iunlink_lookup_prev(agibp->b_pag, ip); > - if (IS_ERR_OR_NULL(pip)) > - return -EFSCORRUPTED; > + mutex_lock(&pag->pag_unlinked_mutex); > + if (ip->i_prev_unlinked == NULLAGINO) { > + struct xfs_agi *agi; > > - if (pip->i_next_unlinked != agino) { > - xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, > - NULL, 0, __this_address); > - return -EFSCORRUPTED; > + next_agino = ip->i_next_unlinked; > + agi = agibp->b_addr; > + > + /* remove from head of list */ > + if (next_agino == agino || > + !xfs_verify_agino_or_null(mp, agno, next_agino)) > + goto err_fscorrupted; > + > + error = xfs_iunlink_update_bucket(tp, agno, > + agibp, agino, next_agino); > + if (error) > + goto err_out; > + > + goto next_fixup; > } > + } > > - /* update the on disk inode now */ > - pip->i_next_unlinked = next_agino; > - xfs_iunlink_log(tp, pip); > + next_agino = ip->i_next_unlinked; > + > + /* lookup previous inode and update to point at next */ > + pip = xfs_iunlink_lookup_prev(pag, ip); > + if (IS_ERR_OR_NULL(pip)) > + goto err_fscorrupted; > + > + if (pip->i_next_unlinked != agino) { > + xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, > + NULL, 0, __this_address); > + goto err_fscorrupted; > } > + pip->i_next_unlinked = next_agino; > + xfs_iunlink_log(tp, pip); > > - /* lookup the next inode and update to point at prev */ > - if (ip->i_next_unlinked != NULLAGINO) { > +next_fixup: > + if (next_agino == NULLAGINO) { > + /* so iunlink recovery can work here */ > + if (pag->pag_unlinked_tail == ip) > + pag->pag_unlinked_tail = pip; > + } else { > + /* lookup the next inode and update to point at prev */ > struct xfs_inode *nip; > > - nip = xfs_iunlink_lookup_next(agibp->b_pag, ip); > + nip = xfs_iunlink_lookup_next(pag, ip); > if (IS_ERR_OR_NULL(nip)) > - return -EFSCORRUPTED; > + goto err_fscorrupted; > > if (nip->i_prev_unlinked != agino) { > xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, > NULL, 0, __this_address); > - return -EFSCORRUPTED; > + goto err_fscorrupted; > } > /* in memory update only */ > nip->i_prev_unlinked = ip->i_prev_unlinked; > } > + mutex_unlock(&pag->pag_unlinked_mutex); > + xfs_perag_put(pag); > > + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); > /* > * Now clear prev/next from this inode and update on disk if we > * need to clear the on-disk link. > */ > ip->i_prev_unlinked = NULLAGINO; > - ip->i_next_unlinked = NULLAGINO; > - if (next_agino != NULLAGINO) > + if (next_agino != NULLAGINO) { > + ip->i_next_unlinked = NULLAGINO; > xfs_iunlink_log(tp, ip); > + } > return 0; > -} > > -/* > - * This is called when the inode's link count has gone to 0 or we are creating > - * a tmpfile via O_TMPFILE. The inode @ip must have nlink == 0. > - * > - * We place the on-disk inode on a list in the AGI. It will be pulled from this > - * list when the inode is freed. > - */ > -STATIC int > -xfs_iunlink( > - struct xfs_trans *tp, > - struct xfs_inode *ip) > -{ > - struct xfs_mount *mp = tp->t_mountp; > - struct xfs_buf *agibp; > - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); > - int error; > - > - ASSERT(ip->i_next_unlinked == NULLAGINO); > - ASSERT(VFS_I(ip)->i_nlink == 0); > - ASSERT(VFS_I(ip)->i_mode != 0); > - trace_xfs_iunlink(ip); > - > - /* Get the agi buffer first. It ensures lock ordering on the list. */ > - error = xfs_read_agi(mp, tp, agno, &agibp); > - if (error) > - return error; > - > - return xfs_iunlink_insert_inode(tp, agibp, ip); > -} > - > -/* > - * Pull the on-disk inode from the AGI unlinked list. > - */ > -STATIC int > -xfs_iunlink_remove( > - struct xfs_trans *tp, > - struct xfs_inode *ip) > -{ > - struct xfs_mount *mp = tp->t_mountp; > - struct xfs_buf *agibp; > - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); > - int error; > - > - trace_xfs_iunlink_remove(ip); > - > - /* Get the agi buffer first. It ensures lock ordering on the list. */ > - error = xfs_read_agi(mp, tp, agno, &agibp); > - if (error) > - return error; > - > - return xfs_iunlink_remove_inode(tp, agibp, ip); > +err_fscorrupted: > + error = -EFSCORRUPTED; > +err_out: > + mutex_unlock(&pag->pag_unlinked_mutex); > + xfs_perag_put(pag); > + return error; > } > > /* > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c > index d47eea31c165..3f2739316424 100644 > --- a/fs/xfs/xfs_log_recover.c > +++ b/fs/xfs/xfs_log_recover.c > @@ -2801,6 +2801,11 @@ xlog_recover_iunlink_ag( > prev_ip->i_next_unlinked = NULLAGINO; > break; > } > + > + /* XXX: take pag_unlinked_mutex across the loop? */ > + if (!bucket) > + agibp->b_pag->pag_unlinked_tail = ip; > + > if (prev_ip) { > ip->i_prev_unlinked = prev_agino; > xfs_irele(prev_ip); > @@ -2812,6 +2817,7 @@ xlog_recover_iunlink_ag( > } > if (prev_ip) > xfs_irele(prev_ip); > + > if (error) { > /* > * We can't read an inode this bucket points to, or an > diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c > index 031e96ff022d..a9701f863e84 100644 > --- a/fs/xfs/xfs_mount.c > +++ b/fs/xfs/xfs_mount.c > @@ -223,6 +223,9 @@ xfs_initialize_perag( > if (first_initialised == NULLAGNUMBER) > first_initialised = index; > spin_lock_init(&pag->pag_state_lock); > + > + mutex_init(&pag->pag_unlinked_mutex); > + pag->pag_unlinked_tail = NULL; > } > > index = xfs_set_inode_alloc(mp, agcount); > diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h > index a72cfcaa4ad1..107a634a34f7 100644 > --- a/fs/xfs/xfs_mount.h > +++ b/fs/xfs/xfs_mount.h > @@ -372,6 +372,9 @@ typedef struct xfs_perag { > /* reference count */ > uint8_t pagf_refcount_level; > > + struct mutex pag_unlinked_mutex; > + struct xfs_inode *pag_unlinked_tail; What do these fields do? There aren't any comments.... --D > + > /* > * Unlinked inode information. This incore information reflects > * data stored in the AGI, so callers must hold the AGI buffer lock > -- > 2.18.1 >