On Tue, Jun 23, 2020 at 07:50:15PM +1000, Dave Chinner wrote: > From: Dave Chinner <dchinner@xxxxxxxxxx> > > Tracking dirty inodes via cluster buffers creates lock ordering > issues with logging unlinked inode updates direct to the cluster > buffer. The unlinked inode list is unordered, so we can lock cluster > buffers in random orders and that causes deadlocks. > > To solve this problem, we really want to dealy locking the cluster "delay" > buffers until the pre-commit phase where we can order the buffers > correctly along with all the other inode cluster buffers that are > locked by the transaction. However, to do this we need to be able to > tell the transaction which inodes need to have there unlinked list "their" > updated and what it should be updated to. > > We can delay the buffer update to the pre-commit phase based on the > fact taht all unlinked inode list updates are serialised by the AGI "that" > buffer. It will be locked into the transaction before the list > update starts, and will remain locked until the transaction commits. > Hence we can lock and update the cluster buffers safely any time > during the transaction and we are still safe from other racing > unlinked list updates. > > The iunlink log item currently only exists in memory. we need a log > item to attach information to the transaction, but it's context "its" > is completely owned by the transaction. Hence it is never formatted > or inserted into the CIL, nor is it seen by the journal, the AIL or > log recovery. > > This makes it a very simple log item, and the changes makes results > in adding addition buffer log items to the transaction. Hence once > the iunlink log item has run it's pre-commit operation, it can be "its" > dropped by the transaction and released. > > The creation of this in-memory intent does not prevent us from > extending it in future to the journal to replace buffer based > logging of the unlinked list. Changing the format of the items we > write to the on disk journal is beyond the scope of this patchset, > hence we limit it to being in-memory only. <nod> Looks pretty straightforward... > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> > --- > fs/xfs/Makefile | 1 + > fs/xfs/xfs_inode.c | 70 +++---------------- > fs/xfs/xfs_inode_item.c | 3 +- > fs/xfs/xfs_iunlink_item.c | 141 ++++++++++++++++++++++++++++++++++++++ > fs/xfs/xfs_iunlink_item.h | 24 +++++++ > fs/xfs/xfs_super.c | 10 +++ > 6 files changed, 189 insertions(+), 60 deletions(-) > create mode 100644 fs/xfs/xfs_iunlink_item.c > create mode 100644 fs/xfs/xfs_iunlink_item.h > > diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile > index 04611a1068b4..febdf034ca94 100644 > --- a/fs/xfs/Makefile > +++ b/fs/xfs/Makefile > @@ -105,6 +105,7 @@ xfs-y += xfs_log.o \ > xfs_icreate_item.o \ > xfs_inode_item.o \ > xfs_inode_item_recover.o \ > + xfs_iunlink_item.o \ > xfs_refcount_item.o \ > xfs_rmap_item.o \ > xfs_log_recover.o \ > diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c > index 1f1c8819330b..ab288424764c 100644 > --- a/fs/xfs/xfs_inode.c > +++ b/fs/xfs/xfs_inode.c > @@ -35,6 +35,7 @@ > #include "xfs_log.h" > #include "xfs_bmap_btree.h" > #include "xfs_reflink.h" > +#include "xfs_iunlink_item.h" > > kmem_zone_t *xfs_inode_zone; > > @@ -1945,56 +1946,6 @@ xfs_iunlink_lookup_next( > return next_ip; > } > > -/* Set an on-disk inode's next_unlinked pointer. */ > -STATIC void > -xfs_iunlink_update_dinode( > - struct xfs_trans *tp, > - xfs_agnumber_t agno, > - xfs_agino_t agino, > - struct xfs_buf *ibp, > - struct xfs_dinode *dip, > - struct xfs_imap *imap, > - xfs_agino_t next_agino) > -{ > - struct xfs_mount *mp = tp->t_mountp; > - int offset; > - > - ASSERT(xfs_verify_agino_or_null(mp, agno, next_agino)); > - > - trace_xfs_iunlink_update_dinode(mp, agno, agino, > - be32_to_cpu(dip->di_next_unlinked), next_agino); > - > - dip->di_next_unlinked = cpu_to_be32(next_agino); > - offset = imap->im_boffset + > - offsetof(struct xfs_dinode, di_next_unlinked); > - > - /* need to recalc the inode CRC if appropriate */ > - xfs_dinode_calc_crc(mp, dip); > - xfs_trans_inode_buf(tp, ibp); > - xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1); > -} > - > -/* Set an in-core inode's unlinked pointer and return the old value. */ > -static int > -xfs_iunlink_update_inode( > - struct xfs_trans *tp, > - struct xfs_inode *ip) > -{ > - struct xfs_mount *mp = tp->t_mountp; > - xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); > - struct xfs_dinode *dip; > - struct xfs_buf *ibp; > - int error; > - > - error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp, 0); > - if (error) > - return error; > - > - xfs_iunlink_update_dinode(tp, agno, XFS_INO_TO_AGINO(mp, ip->i_ino), > - ibp, dip, &ip->i_imap, ip->i_next_unlinked); > - return 0; > -} > - > /* > * Point the AGI unlinked bucket at an inode and log the results. The caller > * is responsible for validating the old value. > @@ -2051,7 +2002,6 @@ xfs_iunlink_insert_inode( > xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino); > xfs_agino_t next_agino; > short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS; > - int error; > > /* > * Get the index into the agi hash table for the list this inode will > @@ -2082,9 +2032,7 @@ xfs_iunlink_insert_inode( > nip->i_prev_unlinked = agino; > > /* update the on disk inode now */ > - error = xfs_iunlink_update_inode(tp, ip); > - if (error) > - return error; > + xfs_iunlink_log(tp, ip); > } > > /* Point the head of the list to point to this inode. */ > @@ -2140,9 +2088,7 @@ xfs_iunlink_remove_inode( > > /* update the on disk inode now */ > pip->i_next_unlinked = next_agino; > - error = xfs_iunlink_update_inode(tp, pip); > - if (error) > - return error; > + xfs_iunlink_log(tp, pip); > } > > /* lookup the next inode and update to point at prev */ > @@ -2162,10 +2108,15 @@ xfs_iunlink_remove_inode( > nip->i_prev_unlinked = ip->i_prev_unlinked; > } > > - /* now clear prev/next from this inode and update on disk */ > + /* > + * Now clear prev/next from this inode and update on disk if we > + * need to clear the on-disk link. > + */ > ip->i_prev_unlinked = NULLAGINO; > ip->i_next_unlinked = NULLAGINO; > - return xfs_iunlink_update_inode(tp, ip); > + if (next_agino != NULLAGINO) > + xfs_iunlink_log(tp, ip); > + return 0; > } > > /* > @@ -2185,6 +2136,7 @@ xfs_iunlink( > xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino); > int error; > > + ASSERT(ip->i_next_unlinked == NULLAGINO); > ASSERT(VFS_I(ip)->i_nlink == 0); > ASSERT(VFS_I(ip)->i_mode != 0); > trace_xfs_iunlink(ip); > diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c > index 0494b907c63d..bc1970c37edc 100644 > --- a/fs/xfs/xfs_inode_item.c > +++ b/fs/xfs/xfs_inode_item.c > @@ -488,8 +488,9 @@ xfs_inode_item_push( > ASSERT(iip->ili_item.li_buf); > > if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || > - (ip->i_flags & XFS_ISTALE)) > + (ip->i_flags & XFS_ISTALE)) { > return XFS_ITEM_PINNED; > + } > > /* If the inode is already flush locked, we're already flushing. */ > if (xfs_iflags_test(ip, XFS_IFLUSHING)) > diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c > new file mode 100644 > index 000000000000..83f1dc81133b > --- /dev/null > +++ b/fs/xfs/xfs_iunlink_item.c > @@ -0,0 +1,141 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Copyright (c) 2020, Red Hat, Inc. > + * All Rights Reserved. > + */ > +#include "xfs.h" > +#include "xfs_fs.h" > +#include "xfs_shared.h" > +#include "xfs_format.h" > +#include "xfs_log_format.h" > +#include "xfs_trans_resv.h" > +#include "xfs_mount.h" > +#include "xfs_inode.h" > +#include "xfs_trans.h" > +#include "xfs_trans_priv.h" > +#include "xfs_iunlink_item.h" > +#include "xfs_trace.h" > + > +kmem_zone_t *xfs_iunlink_zone; > + > +static inline struct xfs_iunlink_item *IUL_ITEM(struct xfs_log_item *lip) > +{ > + return container_of(lip, struct xfs_iunlink_item, iu_item); > +} > + > +static void > +xfs_iunlink_item_release( > + struct xfs_log_item *lip) > +{ > + kmem_cache_free(xfs_iunlink_zone, IUL_ITEM(lip)); > +} > + > + > +static uint64_t > +xfs_iunlink_item_sort( > + struct xfs_log_item *lip) > +{ > + return IUL_ITEM(lip)->iu_ino; > +} Oh, I see, ->iop_sort is supposed to return a sorting key for each log item so that we can reorder the iunlink items to take locks in the correct order. > + > +/* > + * On precommit, we grab the inode cluster buffer for the inode number > + * we were passed, then update the next unlinked field for that inode in > + * the buffer and log the buffer. This ensures that the inode cluster buffer > + * was logged in the correct order w.r.t. other inode cluster buffers. > + * > + * Note: if the inode cluster buffer is marked stale, this transaction is > + * actually freeing the inode cluster. In that case, do not relog the buffer > + * as this removes the stale state from it. That then causes the post-commit > + * processing that is dependent on the cluster buffer being stale to go wrong > + * and we'll leave stale inodes in the AIL that cannot be removed, hanging the > + * log. > + */ > +static int > +xfs_iunlink_item_precommit( > + struct xfs_trans *tp, > + struct xfs_log_item *lip) > +{ > + struct xfs_mount *mp = tp->t_mountp; > + struct xfs_iunlink_item *iup = IUL_ITEM(lip); > + xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, iup->iu_ino); > + xfs_agino_t agino = XFS_INO_TO_AGINO(mp, iup->iu_ino); > + struct xfs_dinode *dip; > + struct xfs_buf *bp; > + int offset; > + int error; > + > + error = xfs_imap_to_bp(mp, tp, &iup->iu_imap, &dip, &bp, 0); > + if (error) > + goto out_remove; > + > + trace_xfs_iunlink_update_dinode(mp, agno, agino, > + be32_to_cpu(dip->di_next_unlinked), > + iup->iu_next_unlinked); > + > + /* > + * Don't bother updating the unlinked field on stale buffers as > + * it will never get to disk anyway. > + */ > + if (bp->b_flags & XBF_STALE) > + goto out_remove; > + > + dip->di_next_unlinked = cpu_to_be32(iup->iu_next_unlinked); > + offset = iup->iu_imap.im_boffset + > + offsetof(struct xfs_dinode, di_next_unlinked); > + > + /* need to recalc the inode CRC if appropriate */ > + xfs_dinode_calc_crc(mp, dip); > + xfs_trans_inode_buf(tp, bp); > + xfs_trans_log_buf(tp, bp, offset, offset + sizeof(xfs_agino_t) - 1); > + > +out_remove: > + /* > + * This log item only exists to perform this action. We now remove > + * it from the transaction and free it as it should never reach the > + * CIL. > + */ > + list_del(&lip->li_trans); > + xfs_iunlink_item_release(lip); > + return error; > +} > + > +static const struct xfs_item_ops xfs_iunlink_item_ops = { > + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, > + .iop_release = xfs_iunlink_item_release, > + .iop_sort = xfs_iunlink_item_sort, > + .iop_precommit = xfs_iunlink_item_precommit, > +}; > + > + Hm, ok, I see how these pieces fit together. This looks like a clever combination of the incore inode prev_unlinked field and log items, and I agree that it's less messy than the backref hashtable. > +/* > + * Initialize the inode log item for a newly allocated (in-core) inode. > + * > + * Inode extents can only reside within an AG. Hence specify the starting > + * block for the inode chunk by offset within an AG as well as the > + * length of the allocated extent. > + * > + * This joins the item to the transaction and marks it dirty so > + * that we don't need a separate call to do this, nor does the > + * caller need to know anything about the iunlink item. > + */ > +void > +xfs_iunlink_log( > + struct xfs_trans *tp, > + struct xfs_inode *ip) > +{ > + struct xfs_iunlink_item *iup; > + > + iup = kmem_zone_zalloc(xfs_iunlink_zone, 0); > + > + xfs_log_item_init(tp->t_mountp, &iup->iu_item, XFS_LI_IUNLINK, > + &xfs_iunlink_item_ops); > + > + iup->iu_ino = ip->i_ino; > + iup->iu_next_unlinked = ip->i_next_unlinked; > + iup->iu_imap = ip->i_imap; > + > + xfs_trans_add_item(tp, &iup->iu_item); > + tp->t_flags |= XFS_TRANS_DIRTY; > + set_bit(XFS_LI_DIRTY, &iup->iu_item.li_flags); > +} > diff --git a/fs/xfs/xfs_iunlink_item.h b/fs/xfs/xfs_iunlink_item.h > new file mode 100644 > index 000000000000..c9e58acf4ccf > --- /dev/null > +++ b/fs/xfs/xfs_iunlink_item.h > @@ -0,0 +1,24 @@ > +// SPDX-License-Identifier: GPL-2.0 Nitpick: Insane SPDX rules say that this should be a C-style comment because it's a header file (whereas C++ style comments are fine for C source files because IDFGI. --D > +/* > + * Copyright (c) 2020, Red Hat, Inc. > + * All Rights Reserved. > + */ > +#ifndef XFS_IUNLINK_ITEM_H > +#define XFS_IUNLINK_ITEM_H 1 > + > +struct xfs_trans; > +struct xfs_inode; > + > +/* in memory log item structure */ > +struct xfs_iunlink_item { > + struct xfs_log_item iu_item; > + struct xfs_imap iu_imap; > + xfs_ino_t iu_ino; > + xfs_agino_t iu_next_unlinked; > +}; > + > +extern kmem_zone_t *xfs_iunlink_zone; > + > +void xfs_iunlink_log(struct xfs_trans *tp, struct xfs_inode *ip); > + > +#endif /* XFS_IUNLINK_ITEM_H */ > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > index 5a5d9453cf51..a36dfb0e7e5b 100644 > --- a/fs/xfs/xfs_super.c > +++ b/fs/xfs/xfs_super.c > @@ -35,6 +35,7 @@ > #include "xfs_refcount_item.h" > #include "xfs_bmap_item.h" > #include "xfs_reflink.h" > +#include "xfs_iunlink_item.h" > > #include <linux/magic.h> > #include <linux/fs_context.h> > @@ -1955,8 +1956,16 @@ xfs_init_zones(void) > if (!xfs_bui_zone) > goto out_destroy_bud_zone; > > + xfs_iunlink_zone = kmem_cache_create("xfs_iul_item", > + sizeof(struct xfs_iunlink_item), > + 0, 0, NULL); > + if (!xfs_iunlink_zone) > + goto out_destroy_bui_zone; > + > return 0; > > + out_destroy_bui_zone: > + kmem_cache_destroy(xfs_bui_zone); > out_destroy_bud_zone: > kmem_cache_destroy(xfs_bud_zone); > out_destroy_cui_zone: > @@ -2003,6 +2012,7 @@ xfs_destroy_zones(void) > * destroy caches. > */ > rcu_barrier(); > + kmem_cache_destroy(xfs_iunlink_zone); > kmem_cache_destroy(xfs_bui_zone); > kmem_cache_destroy(xfs_bud_zone); > kmem_cache_destroy(xfs_cui_zone); > -- > 2.26.2.761.g0e0b3e54be >