On Fri, May 19, 2023 at 10:18:29AM -0700, Wengang Wang wrote: > The following calltrace is seen: > #0 context_switch() kernel/sched/core.c:3881 > #1 __schedule() kernel/sched/core.c:5111 > #2 schedule() kernel/sched/core.c:5186 > #3 xfs_extent_busy_flush() fs/xfs/xfs_extent_busy.c:598 > #4 xfs_alloc_ag_vextent_size() fs/xfs/libxfs/xfs_alloc.c:1641 > #5 xfs_alloc_ag_vextent() fs/xfs/libxfs/xfs_alloc.c:828 > #6 xfs_alloc_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:2362 > #7 xfs_free_extent_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:3029 > #8 __xfs_free_extent() fs/xfs/libxfs/xfs_alloc.c:3067 > #9 xfs_trans_free_extent() fs/xfs/xfs_extfree_item.c:370 > #10 xfs_efi_recover() fs/xfs/xfs_extfree_item.c:626 > #11 xlog_recover_process_efi() fs/xfs/xfs_log_recover.c:4605 > #12 xlog_recover_process_intents() fs/xfs/xfs_log_recover.c:4893 > #13 xlog_recover_finish() fs/xfs/xfs_log_recover.c:5824 > #14 xfs_log_mount_finish() fs/xfs/xfs_log.c:764 > #15 xfs_mountfs() fs/xfs/xfs_mount.c:978 > #16 xfs_fs_fill_super() fs/xfs/xfs_super.c:1908 > #17 mount_bdev() fs/super.c:1417 > #18 xfs_fs_mount() fs/xfs/xfs_super.c:1985 > #19 legacy_get_tree() fs/fs_context.c:647 > #20 vfs_get_tree() fs/super.c:1547 > #21 do_new_mount() fs/namespace.c:2843 > #22 do_mount() fs/namespace.c:3163 > #23 ksys_mount() fs/namespace.c:3372 > #24 __do_sys_mount() fs/namespace.c:3386 > #25 __se_sys_mount() fs/namespace.c:3383 > #26 __x64_sys_mount() fs/namespace.c:3383 > #27 do_syscall_64() arch/x86/entry/common.c:296 > #28 entry_SYSCALL_64() arch/x86/entry/entry_64.S:180 > > During the process of the 2nd and subsequetial record in an EFI. > It is waiting for the busy blocks to be cleaned, but the only busy extent > is still hold in current xfs_trans->t_busy. That busy extent was added when > processing previous EFI record. And because that busy extent is not committed > yet, it can't be cleaned. > > To avoid above deadlock, we don't block in xfs_extent_busy_flush() when > allocating AGFL blocks, instead it returns -EAGAIN. On receiving -EAGAIN > we are able to retry that EFI record with a new transaction after committing > the old transactin. With old transaction committed, the busy extent attached > to the old transaction get the change to be cleaned. On the retry, there is > no existing busy extents in the new transaction, thus no deadlock. > > Signed-off-by: Wengang Wang <wen.gang.wang@xxxxxxxxxx> > --- > fs/xfs/libxfs/xfs_alloc.c | 30 ++++++++++++++++++++++++------ > fs/xfs/libxfs/xfs_alloc.h | 2 ++ > fs/xfs/scrub/repair.c | 4 ++-- > fs/xfs/xfs_extent_busy.c | 34 +++++++++++++++++++++++++++++----- > fs/xfs/xfs_extent_busy.h | 6 +++--- > fs/xfs/xfs_extfree_item.c | 37 ++++++++++++++++++++++++++++++++++++- > fs/xfs/xfs_log_recover.c | 23 ++++++++++------------- > fs/xfs/xfs_trans_ail.c | 2 +- > fs/xfs/xfs_trans_priv.h | 1 + > 9 files changed, 108 insertions(+), 31 deletions(-) > > diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c > index 203f16c48c19..abfd2acb3053 100644 > --- a/fs/xfs/libxfs/xfs_alloc.c > +++ b/fs/xfs/libxfs/xfs_alloc.c > @@ -1491,6 +1491,7 @@ STATIC int > xfs_alloc_ag_vextent_near( > struct xfs_alloc_arg *args) > { > + int flags = args->flags | XFS_ALLOC_FLAG_TRYFLUSH; > struct xfs_alloc_cur acur = {}; > int error; /* error code */ > int i; /* result code, temporary */ > @@ -1564,8 +1565,11 @@ xfs_alloc_ag_vextent_near( > if (!acur.len) { > if (acur.busy) { > trace_xfs_alloc_near_busy(args); > - xfs_extent_busy_flush(args->mp, args->pag, > - acur.busy_gen); > + error = xfs_extent_busy_flush(args->tp, args->pag, > + acur.busy_gen, flags); > + if (error) > + goto out; > + flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; > goto restart; > } > trace_xfs_alloc_size_neither(args); > @@ -1592,6 +1596,7 @@ STATIC int /* error */ > xfs_alloc_ag_vextent_size( > xfs_alloc_arg_t *args) /* allocation argument structure */ > { > + int flags = args->flags | XFS_ALLOC_FLAG_TRYFLUSH; This variable holds XFS_ALLOC_FLAG_* values that are passed only to xfs_extent_busy_flush, right? Could this be named "busyflags" or similar to make that clearer? (Clearer to 6hr^W6mo from now Darrick who won't remember this at all.) > struct xfs_agf *agf = args->agbp->b_addr; > struct xfs_btree_cur *bno_cur; /* cursor for bno btree */ > struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */ > @@ -1670,8 +1675,13 @@ xfs_alloc_ag_vextent_size( > xfs_btree_del_cursor(cnt_cur, > XFS_BTREE_NOERROR); > trace_xfs_alloc_size_busy(args); > - xfs_extent_busy_flush(args->mp, > - args->pag, busy_gen); > + error = xfs_extent_busy_flush(args->tp, args->pag, > + busy_gen, flags); > + if (error) { > + cnt_cur = NULL; > + goto error0; > + } > + flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; > goto restart; > } > } > @@ -1755,7 +1765,13 @@ xfs_alloc_ag_vextent_size( > if (busy) { > xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); > trace_xfs_alloc_size_busy(args); > - xfs_extent_busy_flush(args->mp, args->pag, busy_gen); > + error = xfs_extent_busy_flush(args->tp, args->pag, > + busy_gen, flags); > + if (error) { > + cnt_cur = NULL; > + goto error0; > + } > + flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; > goto restart; > } > goto out_nominleft; > @@ -2629,6 +2645,7 @@ xfs_alloc_fix_freelist( > targs.agno = args->agno; > targs.alignment = targs.minlen = targs.prod = 1; > targs.pag = pag; > + targs.flags = args->flags & XFS_ALLOC_FLAG_FREEING; Hmm, I guess this propagates FREEING into the allocation that lengthens the AGFL. Right? > error = xfs_alloc_read_agfl(pag, tp, &agflbp); > if (error) > goto out_agbp_relse; > @@ -3572,6 +3589,7 @@ xfs_free_extent_fix_freelist( > args.mp = tp->t_mountp; > args.agno = pag->pag_agno; > args.pag = pag; > + args.flags = XFS_ALLOC_FLAG_FREEING; > > /* > * validate that the block number is legal - the enables us to detect > @@ -3580,7 +3598,7 @@ xfs_free_extent_fix_freelist( > if (args.agno >= args.mp->m_sb.sb_agcount) > return -EFSCORRUPTED; > > - error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); > + error = xfs_alloc_fix_freelist(&args, args.flags); Hm. Why stuff XFS_ALLOC_FLAG_FREEING into args.flags here? I /think/ in this case it doesn't matter because nothing under xfs_alloc_fix_freelist accesses args->flags...? AFAICT in xfs_alloc_fix_freelist, the difference between @args->flags and @flags is that @args is the allocation that we're trying to do, whereas @flags are for any allocation that might need to be done to fix the freelist. IOWS, the only allocation we're doing *is* to fix the freelist, so they are one in the same. Maybe? Otherwise I can't tell why we'd convey two sets of XFS_ALLOC flags to xfs_alloc_fix_freelist. So while I think this isn't incorrect, the flag mixing bothers me a bit because setting fields in a struct can have weird not so obvious side effects. (Still trying to make sense of the addition of a @flags field to struct xfs_alloc_arg.) > if (error) > return error; > > diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h > index 2b246d74c189..5038fba87784 100644 > --- a/fs/xfs/libxfs/xfs_alloc.h > +++ b/fs/xfs/libxfs/xfs_alloc.h > @@ -24,6 +24,7 @@ unsigned int xfs_agfl_size(struct xfs_mount *mp); > #define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */ > #define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */ > #define XFS_ALLOC_FLAG_CHECK 0x00000010 /* test only, don't modify args */ > +#define XFS_ALLOC_FLAG_TRYFLUSH 0x00000020 /* don't block in busyextent flush*/ > > /* > * Argument structure for xfs_alloc routines. > @@ -57,6 +58,7 @@ typedef struct xfs_alloc_arg { > #ifdef DEBUG > bool alloc_minlen_only; /* allocate exact minlen extent */ > #endif > + int flags; /* XFS_ALLOC_FLAG_* */ > } xfs_alloc_arg_t; > > /* > diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c > index 1b71174ec0d6..2ba28e4257fe 100644 > --- a/fs/xfs/scrub/repair.c > +++ b/fs/xfs/scrub/repair.c > @@ -496,9 +496,9 @@ xrep_fix_freelist( > args.agno = sc->sa.pag->pag_agno; > args.alignment = 1; > args.pag = sc->sa.pag; > + args.flags = can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK; > > - return xfs_alloc_fix_freelist(&args, > - can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); > + return xfs_alloc_fix_freelist(&args, args.flags); > } > > /* > diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c > index f3d328e4a440..ea1c1857bf5b 100644 > --- a/fs/xfs/xfs_extent_busy.c > +++ b/fs/xfs/xfs_extent_busy.c > @@ -567,18 +567,41 @@ xfs_extent_busy_clear( > /* > * Flush out all busy extents for this AG. > */ > -void > +int > xfs_extent_busy_flush( > - struct xfs_mount *mp, > + struct xfs_trans *tp, > struct xfs_perag *pag, > - unsigned busy_gen) > + unsigned busy_gen, > + int flags) > { > DEFINE_WAIT (wait); > int error; > > - error = xfs_log_force(mp, XFS_LOG_SYNC); > + error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); > if (error) > - return; > + return error; > + > + /* > + * If we are holding busy extents, the caller may not want to block > + * straight away. If we are being told just to try a flush or progress > + * has been made since we last skipped a busy extent, return > + * immediately to allow the caller to try again. If we are freeing > + * extents, we might actually be holding the only free extents in the > + * transaction busy list and the log force won't resolve that > + * situation. In this case, return -EAGAIN in that case to tell the > + * caller it needs to commit the busy extents it holds before retrying > + * the extent free operation. > + */ > + if (!list_empty(&tp->t_busy)) { > + if (flags & XFS_ALLOC_FLAG_TRYFLUSH) > + return 0; > + > + if (busy_gen != READ_ONCE(pag->pagb_gen)) > + return 0; > + > + if (flags & XFS_ALLOC_FLAG_FREEING) > + return -EAGAIN; > + } > > do { > prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); > @@ -588,6 +611,7 @@ xfs_extent_busy_flush( > } while (1); > > finish_wait(&pag->pagb_wait, &wait); > + return 0; This part looks good. > } > > void > diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h > index 4a118131059f..edeedb92e0df 100644 > --- a/fs/xfs/xfs_extent_busy.h > +++ b/fs/xfs/xfs_extent_busy.h > @@ -51,9 +51,9 @@ bool > xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno, > xfs_extlen_t *len, unsigned *busy_gen); > > -void > -xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag, > - unsigned busy_gen); > +int > +xfs_extent_busy_flush(struct xfs_trans *tp, struct xfs_perag *pag, > + unsigned busy_gen, int flags); > > void > xfs_extent_busy_wait_all(struct xfs_mount *mp); > diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c > index 011b50469301..3c5a9e9952ec 100644 > --- a/fs/xfs/xfs_extfree_item.c > +++ b/fs/xfs/xfs_extfree_item.c > @@ -336,6 +336,25 @@ xfs_trans_get_efd( > return efdp; > } > > +/* > + * Fill the EFD with all extents from the EFI and set the counter. > + * Note: the EFD should comtain at least one extents already. > + */ > +static void xfs_fill_efd_with_efi(struct xfs_efd_log_item *efdp) > +{ > + struct xfs_efi_log_item *efip = efdp->efd_efip; > + uint i; > + > + if (efdp->efd_next_extent == efip->efi_format.efi_nextents) > + return; > + > + for (i = 0; i < efip->efi_format.efi_nextents; i++) { > + efdp->efd_format.efd_extents[i] = > + efip->efi_format.efi_extents[i]; > + } > + efdp->efd_next_extent = efip->efi_format.efi_nextents; > +} > + > /* > * Free an extent and log it to the EFD. Note that the transaction is marked > * dirty regardless of whether the extent free succeeds or fails to support the > @@ -369,6 +388,10 @@ xfs_trans_free_extent( > error = __xfs_free_extent(tp, xefi->xefi_startblock, > xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, > xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); > + if (error == -EAGAIN) { > + xfs_fill_efd_with_efi(efdp); > + return error; > + } > /* > * Mark the transaction dirty, even on error. This ensures the > * transaction is aborted, which: > @@ -476,7 +499,8 @@ xfs_extent_free_finish_item( > xefi = container_of(item, struct xfs_extent_free_item, xefi_list); > > error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); > - kmem_cache_free(xfs_extfree_item_cache, xefi); > + if (error != -EAGAIN) > + kmem_cache_free(xfs_extfree_item_cache, xefi); > return error; > } > > @@ -633,6 +657,17 @@ xfs_efi_item_recover( > fake.xefi_blockcount = extp->ext_len; > > error = xfs_trans_free_extent(tp, efdp, &fake); > + if (error == -EAGAIN) { Curious, is this patch based off of 6.4-rc? There should be a xfs_extent_free_get_group / xfs_extent_free_put_group here. > + xfs_free_extent_later(tp, fake.xefi_startblock, > + fake.xefi_blockcount, &XFS_RMAP_OINFO_ANY_OWNER); > + /* > + * try to free as many extents as possible with current > + * transaction > + */ > + error = 0; > + continue; > + }; If we get EAGAIN, why not let the unfinished parts of the recovered EFI work get turned into a new EFI? > + > if (error == -EFSCORRUPTED) > XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, > extp, sizeof(*extp)); > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c > index 322eb2ee6c55..00bfe9683fa8 100644 > --- a/fs/xfs/xfs_log_recover.c > +++ b/fs/xfs/xfs_log_recover.c > @@ -2540,30 +2540,27 @@ xlog_recover_process_intents( > struct xfs_log_item *lip; > struct xfs_ail *ailp; > int error = 0; > -#if defined(DEBUG) || defined(XFS_WARN) > - xfs_lsn_t last_lsn; > -#endif > + xfs_lsn_t threshold_lsn; > > ailp = log->l_ailp; > + threshold_lsn = xfs_ail_max_lsn(ailp); > spin_lock(&ailp->ail_lock); > -#if defined(DEBUG) || defined(XFS_WARN) > - last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); > -#endif > + > for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); > lip != NULL; > lip = xfs_trans_ail_cursor_next(ailp, &cur)) { > const struct xfs_item_ops *ops; > + /* > + * Orignal redo EFI could be splitted into new EFIs. Those > + * new EFIs are supposed to be processed in capture_list. > + * Stop here when original redo intents are done. > + */ > + if (XFS_LSN_CMP(threshold_lsn, lip->li_lsn) < 0) > + break; I'm not sure what this change accomplishes? Is it because of the continue; statement in xfs_efi_item_recover? --D > > if (!xlog_item_is_intent(lip)) > break; > > - /* > - * We should never see a redo item with a LSN higher than > - * the last transaction we found in the log at the start > - * of recovery. > - */ > - ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); > - > /* > * NOTE: If your intent processing routine can create more > * deferred ops, you /must/ attach them to the capture list in > diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c > index 7d4109af193e..2825f55eca88 100644 > --- a/fs/xfs/xfs_trans_ail.c > +++ b/fs/xfs/xfs_trans_ail.c > @@ -137,7 +137,7 @@ xfs_ail_min_lsn( > /* > * Return the maximum lsn held in the AIL, or zero if the AIL is empty. > */ > -static xfs_lsn_t > +xfs_lsn_t > xfs_ail_max_lsn( > struct xfs_ail *ailp) > { > diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h > index d5400150358e..86b4f29b2a6e 100644 > --- a/fs/xfs/xfs_trans_priv.h > +++ b/fs/xfs/xfs_trans_priv.h > @@ -106,6 +106,7 @@ void xfs_ail_push_all(struct xfs_ail *); > void xfs_ail_push_all_sync(struct xfs_ail *); > struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp); > xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); > +xfs_lsn_t xfs_ail_max_lsn(struct xfs_ail *ailp); > > struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, > struct xfs_ail_cursor *cur, > -- > 2.21.0 (Apple Git-122.2) >