On Tue, Jun 23, 2020 at 07:50:15PM +1000, Dave Chinner wrote: > From: Dave Chinner <dchinner@xxxxxxxxxx> > > Tracking dirty inodes via cluster buffers creates lock ordering > issues with logging unlinked inode updates direct to the cluster > buffer. The unlinked inode list is unordered, so we can lock cluster > buffers in random orders and that causes deadlocks. > > To solve this problem, we really want to dealy locking the cluster > buffers until the pre-commit phase where we can order the buffers > correctly along with all the other inode cluster buffers that are > locked by the transaction. However, to do this we need to be able to > tell the transaction which inodes need to have there unlinked list > updated and what it should be updated to. > > We can delay the buffer update to the pre-commit phase based on the > fact taht all unlinked inode list updates are serialised by the AGI > buffer. It will be locked into the transaction before the list > update starts, and will remain locked until the transaction commits. > Hence we can lock and update the cluster buffers safely any time > during the transaction and we are still safe from other racing > unlinked list updates. > > The iunlink log item currently only exists in memory. we need a log > item to attach information to the transaction, but it's context > is completely owned by the transaction. Hence it is never formatted > or inserted into the CIL, nor is it seen by the journal, the AIL or > log recovery. > > This makes it a very simple log item, and the changes makes results > in adding addition buffer log items to the transaction. Hence once > the iunlink log item has run it's pre-commit operation, it can be > dropped by the transaction and released. > > The creation of this in-memory intent does not prevent us from > extending it in future to the journal to replace buffer based > logging of the unlinked list. Changing the format of the items we > write to the on disk journal is beyond the scope of this patchset, > hence we limit it to being in-memory only. > > Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> > --- > fs/xfs/Makefile | 1 + > fs/xfs/xfs_inode.c | 70 +++---------------- > fs/xfs/xfs_inode_item.c | 3 +- > fs/xfs/xfs_iunlink_item.c | 141 ++++++++++++++++++++++++++++++++++++++ > fs/xfs/xfs_iunlink_item.h | 24 +++++++ > fs/xfs/xfs_super.c | 10 +++ > 6 files changed, 189 insertions(+), 60 deletions(-) > create mode 100644 fs/xfs/xfs_iunlink_item.c > create mode 100644 fs/xfs/xfs_iunlink_item.h > ... > diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c > index 0494b907c63d..bc1970c37edc 100644 > --- a/fs/xfs/xfs_inode_item.c > +++ b/fs/xfs/xfs_inode_item.c > @@ -488,8 +488,9 @@ xfs_inode_item_push( > ASSERT(iip->ili_item.li_buf); > > if (xfs_ipincount(ip) > 0 || xfs_buf_ispinned(bp) || > - (ip->i_flags & XFS_ISTALE)) > + (ip->i_flags & XFS_ISTALE)) { > return XFS_ITEM_PINNED; > + } Spurious change..? > > /* If the inode is already flush locked, we're already flushing. */ > if (xfs_iflags_test(ip, XFS_IFLUSHING)) > diff --git a/fs/xfs/xfs_iunlink_item.c b/fs/xfs/xfs_iunlink_item.c > new file mode 100644 > index 000000000000..83f1dc81133b > --- /dev/null > +++ b/fs/xfs/xfs_iunlink_item.c > @@ -0,0 +1,141 @@ ... > + > +static const struct xfs_item_ops xfs_iunlink_item_ops = { > + .flags = XFS_ITEM_RELEASE_WHEN_COMMITTED, > + .iop_release = xfs_iunlink_item_release, Presumably we need the release callback for transaction abort, but the flag looks unnecessary. That triggers a release on commit to the on-disk log, which IIUC should never happen for this item. > + .iop_sort = xfs_iunlink_item_sort, > + .iop_precommit = xfs_iunlink_item_precommit, > +}; > + > + > +/* > + * Initialize the inode log item for a newly allocated (in-core) inode. > + * > + * Inode extents can only reside within an AG. Hence specify the starting > + * block for the inode chunk by offset within an AG as well as the > + * length of the allocated extent. > + * > + * This joins the item to the transaction and marks it dirty so > + * that we don't need a separate call to do this, nor does the > + * caller need to know anything about the iunlink item. > + */ Looks like some copy/paste remnants in the comment. Brian > +void > +xfs_iunlink_log( > + struct xfs_trans *tp, > + struct xfs_inode *ip) > +{ > + struct xfs_iunlink_item *iup; > + > + iup = kmem_zone_zalloc(xfs_iunlink_zone, 0); > + > + xfs_log_item_init(tp->t_mountp, &iup->iu_item, XFS_LI_IUNLINK, > + &xfs_iunlink_item_ops); > + > + iup->iu_ino = ip->i_ino; > + iup->iu_next_unlinked = ip->i_next_unlinked; > + iup->iu_imap = ip->i_imap; > + > + xfs_trans_add_item(tp, &iup->iu_item); > + tp->t_flags |= XFS_TRANS_DIRTY; > + set_bit(XFS_LI_DIRTY, &iup->iu_item.li_flags); > +} > diff --git a/fs/xfs/xfs_iunlink_item.h b/fs/xfs/xfs_iunlink_item.h > new file mode 100644 > index 000000000000..c9e58acf4ccf > --- /dev/null > +++ b/fs/xfs/xfs_iunlink_item.h > @@ -0,0 +1,24 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* > + * Copyright (c) 2020, Red Hat, Inc. > + * All Rights Reserved. > + */ > +#ifndef XFS_IUNLINK_ITEM_H > +#define XFS_IUNLINK_ITEM_H 1 > + > +struct xfs_trans; > +struct xfs_inode; > + > +/* in memory log item structure */ > +struct xfs_iunlink_item { > + struct xfs_log_item iu_item; > + struct xfs_imap iu_imap; > + xfs_ino_t iu_ino; > + xfs_agino_t iu_next_unlinked; > +}; > + > +extern kmem_zone_t *xfs_iunlink_zone; > + > +void xfs_iunlink_log(struct xfs_trans *tp, struct xfs_inode *ip); > + > +#endif /* XFS_IUNLINK_ITEM_H */ > diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c > index 5a5d9453cf51..a36dfb0e7e5b 100644 > --- a/fs/xfs/xfs_super.c > +++ b/fs/xfs/xfs_super.c > @@ -35,6 +35,7 @@ > #include "xfs_refcount_item.h" > #include "xfs_bmap_item.h" > #include "xfs_reflink.h" > +#include "xfs_iunlink_item.h" > > #include <linux/magic.h> > #include <linux/fs_context.h> > @@ -1955,8 +1956,16 @@ xfs_init_zones(void) > if (!xfs_bui_zone) > goto out_destroy_bud_zone; > > + xfs_iunlink_zone = kmem_cache_create("xfs_iul_item", > + sizeof(struct xfs_iunlink_item), > + 0, 0, NULL); > + if (!xfs_iunlink_zone) > + goto out_destroy_bui_zone; > + > return 0; > > + out_destroy_bui_zone: > + kmem_cache_destroy(xfs_bui_zone); > out_destroy_bud_zone: > kmem_cache_destroy(xfs_bud_zone); > out_destroy_cui_zone: > @@ -2003,6 +2012,7 @@ xfs_destroy_zones(void) > * destroy caches. > */ > rcu_barrier(); > + kmem_cache_destroy(xfs_iunlink_zone); > kmem_cache_destroy(xfs_bui_zone); > kmem_cache_destroy(xfs_bud_zone); > kmem_cache_destroy(xfs_cui_zone); > -- > 2.26.2.761.g0e0b3e54be >