The following calltrace is seen: #0 context_switch() kernel/sched/core.c:3881 #1 __schedule() kernel/sched/core.c:5111 #2 schedule() kernel/sched/core.c:5186 #3 xfs_extent_busy_flush() fs/xfs/xfs_extent_busy.c:598 #4 xfs_alloc_ag_vextent_size() fs/xfs/libxfs/xfs_alloc.c:1641 #5 xfs_alloc_ag_vextent() fs/xfs/libxfs/xfs_alloc.c:828 #6 xfs_alloc_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:2362 #7 xfs_free_extent_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:3029 #8 __xfs_free_extent() fs/xfs/libxfs/xfs_alloc.c:3067 #9 xfs_trans_free_extent() fs/xfs/xfs_extfree_item.c:370 #10 xfs_efi_recover() fs/xfs/xfs_extfree_item.c:626 #11 xlog_recover_process_efi() fs/xfs/xfs_log_recover.c:4605 #12 xlog_recover_process_intents() fs/xfs/xfs_log_recover.c:4893 #13 xlog_recover_finish() fs/xfs/xfs_log_recover.c:5824 #14 xfs_log_mount_finish() fs/xfs/xfs_log.c:764 #15 xfs_mountfs() fs/xfs/xfs_mount.c:978 #16 xfs_fs_fill_super() fs/xfs/xfs_super.c:1908 #17 mount_bdev() fs/super.c:1417 #18 xfs_fs_mount() fs/xfs/xfs_super.c:1985 #19 legacy_get_tree() fs/fs_context.c:647 #20 vfs_get_tree() fs/super.c:1547 #21 do_new_mount() fs/namespace.c:2843 #22 do_mount() fs/namespace.c:3163 #23 ksys_mount() fs/namespace.c:3372 #24 __do_sys_mount() fs/namespace.c:3386 #25 __se_sys_mount() fs/namespace.c:3383 #26 __x64_sys_mount() fs/namespace.c:3383 #27 do_syscall_64() arch/x86/entry/common.c:296 #28 entry_SYSCALL_64() arch/x86/entry/entry_64.S:180 During the process of the 2nd and subsequetial record in an EFI. It is waiting for the busy blocks to be cleaned, but the only busy extent is still hold in current xfs_trans->t_busy. That busy extent was added when processing previous EFI record. And because that busy extent is not committed yet, it can't be cleaned. To avoid above deadlock, we don't block in xfs_extent_busy_flush() when allocating AGFL blocks, instead it returns -EAGAIN. On receiving -EAGAIN we are able to retry that EFI record with a new transaction after committing the old transactin. With old transaction committed, the busy extent attached to the old transaction get the change to be cleaned. On the retry, there is no existing busy extents in the new transaction, thus no deadlock. Signed-off-by: Wengang Wang <wen.gang.wang@xxxxxxxxxx> --- fs/xfs/libxfs/xfs_alloc.c | 30 ++++++++++++++++++++++++------ fs/xfs/libxfs/xfs_alloc.h | 2 ++ fs/xfs/scrub/repair.c | 4 ++-- fs/xfs/xfs_extent_busy.c | 34 +++++++++++++++++++++++++++++----- fs/xfs/xfs_extent_busy.h | 6 +++--- fs/xfs/xfs_extfree_item.c | 37 ++++++++++++++++++++++++++++++++++++- fs/xfs/xfs_log_recover.c | 23 ++++++++++------------- fs/xfs/xfs_trans_ail.c | 2 +- fs/xfs/xfs_trans_priv.h | 1 + 9 files changed, 108 insertions(+), 31 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 203f16c48c19..abfd2acb3053 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -1491,6 +1491,7 @@ STATIC int xfs_alloc_ag_vextent_near( struct xfs_alloc_arg *args) { + int flags = args->flags | XFS_ALLOC_FLAG_TRYFLUSH; struct xfs_alloc_cur acur = {}; int error; /* error code */ int i; /* result code, temporary */ @@ -1564,8 +1565,11 @@ xfs_alloc_ag_vextent_near( if (!acur.len) { if (acur.busy) { trace_xfs_alloc_near_busy(args); - xfs_extent_busy_flush(args->mp, args->pag, - acur.busy_gen); + error = xfs_extent_busy_flush(args->tp, args->pag, + acur.busy_gen, flags); + if (error) + goto out; + flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; goto restart; } trace_xfs_alloc_size_neither(args); @@ -1592,6 +1596,7 @@ STATIC int /* error */ xfs_alloc_ag_vextent_size( xfs_alloc_arg_t *args) /* allocation argument structure */ { + int flags = args->flags | XFS_ALLOC_FLAG_TRYFLUSH; struct xfs_agf *agf = args->agbp->b_addr; struct xfs_btree_cur *bno_cur; /* cursor for bno btree */ struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */ @@ -1670,8 +1675,13 @@ xfs_alloc_ag_vextent_size( xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); trace_xfs_alloc_size_busy(args); - xfs_extent_busy_flush(args->mp, - args->pag, busy_gen); + error = xfs_extent_busy_flush(args->tp, args->pag, + busy_gen, flags); + if (error) { + cnt_cur = NULL; + goto error0; + } + flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; goto restart; } } @@ -1755,7 +1765,13 @@ xfs_alloc_ag_vextent_size( if (busy) { xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); trace_xfs_alloc_size_busy(args); - xfs_extent_busy_flush(args->mp, args->pag, busy_gen); + error = xfs_extent_busy_flush(args->tp, args->pag, + busy_gen, flags); + if (error) { + cnt_cur = NULL; + goto error0; + } + flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; goto restart; } goto out_nominleft; @@ -2629,6 +2645,7 @@ xfs_alloc_fix_freelist( targs.agno = args->agno; targs.alignment = targs.minlen = targs.prod = 1; targs.pag = pag; + targs.flags = args->flags & XFS_ALLOC_FLAG_FREEING; error = xfs_alloc_read_agfl(pag, tp, &agflbp); if (error) goto out_agbp_relse; @@ -3572,6 +3589,7 @@ xfs_free_extent_fix_freelist( args.mp = tp->t_mountp; args.agno = pag->pag_agno; args.pag = pag; + args.flags = XFS_ALLOC_FLAG_FREEING; /* * validate that the block number is legal - the enables us to detect @@ -3580,7 +3598,7 @@ xfs_free_extent_fix_freelist( if (args.agno >= args.mp->m_sb.sb_agcount) return -EFSCORRUPTED; - error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); + error = xfs_alloc_fix_freelist(&args, args.flags); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 2b246d74c189..5038fba87784 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -24,6 +24,7 @@ unsigned int xfs_agfl_size(struct xfs_mount *mp); #define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */ #define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */ #define XFS_ALLOC_FLAG_CHECK 0x00000010 /* test only, don't modify args */ +#define XFS_ALLOC_FLAG_TRYFLUSH 0x00000020 /* don't block in busyextent flush*/ /* * Argument structure for xfs_alloc routines. @@ -57,6 +58,7 @@ typedef struct xfs_alloc_arg { #ifdef DEBUG bool alloc_minlen_only; /* allocate exact minlen extent */ #endif + int flags; /* XFS_ALLOC_FLAG_* */ } xfs_alloc_arg_t; /* diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 1b71174ec0d6..2ba28e4257fe 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -496,9 +496,9 @@ xrep_fix_freelist( args.agno = sc->sa.pag->pag_agno; args.alignment = 1; args.pag = sc->sa.pag; + args.flags = can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK; - return xfs_alloc_fix_freelist(&args, - can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK); + return xfs_alloc_fix_freelist(&args, args.flags); } /* diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index f3d328e4a440..ea1c1857bf5b 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -567,18 +567,41 @@ xfs_extent_busy_clear( /* * Flush out all busy extents for this AG. */ -void +int xfs_extent_busy_flush( - struct xfs_mount *mp, + struct xfs_trans *tp, struct xfs_perag *pag, - unsigned busy_gen) + unsigned busy_gen, + int flags) { DEFINE_WAIT (wait); int error; - error = xfs_log_force(mp, XFS_LOG_SYNC); + error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC); if (error) - return; + return error; + + /* + * If we are holding busy extents, the caller may not want to block + * straight away. If we are being told just to try a flush or progress + * has been made since we last skipped a busy extent, return + * immediately to allow the caller to try again. If we are freeing + * extents, we might actually be holding the only free extents in the + * transaction busy list and the log force won't resolve that + * situation. In this case, return -EAGAIN in that case to tell the + * caller it needs to commit the busy extents it holds before retrying + * the extent free operation. + */ + if (!list_empty(&tp->t_busy)) { + if (flags & XFS_ALLOC_FLAG_TRYFLUSH) + return 0; + + if (busy_gen != READ_ONCE(pag->pagb_gen)) + return 0; + + if (flags & XFS_ALLOC_FLAG_FREEING) + return -EAGAIN; + } do { prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE); @@ -588,6 +611,7 @@ xfs_extent_busy_flush( } while (1); finish_wait(&pag->pagb_wait, &wait); + return 0; } void diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h index 4a118131059f..edeedb92e0df 100644 --- a/fs/xfs/xfs_extent_busy.h +++ b/fs/xfs/xfs_extent_busy.h @@ -51,9 +51,9 @@ bool xfs_extent_busy_trim(struct xfs_alloc_arg *args, xfs_agblock_t *bno, xfs_extlen_t *len, unsigned *busy_gen); -void -xfs_extent_busy_flush(struct xfs_mount *mp, struct xfs_perag *pag, - unsigned busy_gen); +int +xfs_extent_busy_flush(struct xfs_trans *tp, struct xfs_perag *pag, + unsigned busy_gen, int flags); void xfs_extent_busy_wait_all(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index 011b50469301..3c5a9e9952ec 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -336,6 +336,25 @@ xfs_trans_get_efd( return efdp; } +/* + * Fill the EFD with all extents from the EFI and set the counter. + * Note: the EFD should comtain at least one extents already. + */ +static void xfs_fill_efd_with_efi(struct xfs_efd_log_item *efdp) +{ + struct xfs_efi_log_item *efip = efdp->efd_efip; + uint i; + + if (efdp->efd_next_extent == efip->efi_format.efi_nextents) + return; + + for (i = 0; i < efip->efi_format.efi_nextents; i++) { + efdp->efd_format.efd_extents[i] = + efip->efi_format.efi_extents[i]; + } + efdp->efd_next_extent = efip->efi_format.efi_nextents; +} + /* * Free an extent and log it to the EFD. Note that the transaction is marked * dirty regardless of whether the extent free succeeds or fails to support the @@ -369,6 +388,10 @@ xfs_trans_free_extent( error = __xfs_free_extent(tp, xefi->xefi_startblock, xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE, xefi->xefi_flags & XFS_EFI_SKIP_DISCARD); + if (error == -EAGAIN) { + xfs_fill_efd_with_efi(efdp); + return error; + } /* * Mark the transaction dirty, even on error. This ensures the * transaction is aborted, which: @@ -476,7 +499,8 @@ xfs_extent_free_finish_item( xefi = container_of(item, struct xfs_extent_free_item, xefi_list); error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi); - kmem_cache_free(xfs_extfree_item_cache, xefi); + if (error != -EAGAIN) + kmem_cache_free(xfs_extfree_item_cache, xefi); return error; } @@ -633,6 +657,17 @@ xfs_efi_item_recover( fake.xefi_blockcount = extp->ext_len; error = xfs_trans_free_extent(tp, efdp, &fake); + if (error == -EAGAIN) { + xfs_free_extent_later(tp, fake.xefi_startblock, + fake.xefi_blockcount, &XFS_RMAP_OINFO_ANY_OWNER); + /* + * try to free as many extents as possible with current + * transaction + */ + error = 0; + continue; + }; + if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, extp, sizeof(*extp)); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 322eb2ee6c55..00bfe9683fa8 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2540,30 +2540,27 @@ xlog_recover_process_intents( struct xfs_log_item *lip; struct xfs_ail *ailp; int error = 0; -#if defined(DEBUG) || defined(XFS_WARN) - xfs_lsn_t last_lsn; -#endif + xfs_lsn_t threshold_lsn; ailp = log->l_ailp; + threshold_lsn = xfs_ail_max_lsn(ailp); spin_lock(&ailp->ail_lock); -#if defined(DEBUG) || defined(XFS_WARN) - last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block); -#endif + for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL; lip = xfs_trans_ail_cursor_next(ailp, &cur)) { const struct xfs_item_ops *ops; + /* + * Orignal redo EFI could be splitted into new EFIs. Those + * new EFIs are supposed to be processed in capture_list. + * Stop here when original redo intents are done. + */ + if (XFS_LSN_CMP(threshold_lsn, lip->li_lsn) < 0) + break; if (!xlog_item_is_intent(lip)) break; - /* - * We should never see a redo item with a LSN higher than - * the last transaction we found in the log at the start - * of recovery. - */ - ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0); - /* * NOTE: If your intent processing routine can create more * deferred ops, you /must/ attach them to the capture list in diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 7d4109af193e..2825f55eca88 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -137,7 +137,7 @@ xfs_ail_min_lsn( /* * Return the maximum lsn held in the AIL, or zero if the AIL is empty. */ -static xfs_lsn_t +xfs_lsn_t xfs_ail_max_lsn( struct xfs_ail *ailp) { diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index d5400150358e..86b4f29b2a6e 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -106,6 +106,7 @@ void xfs_ail_push_all(struct xfs_ail *); void xfs_ail_push_all_sync(struct xfs_ail *); struct xfs_log_item *xfs_ail_min(struct xfs_ail *ailp); xfs_lsn_t xfs_ail_min_lsn(struct xfs_ail *ailp); +xfs_lsn_t xfs_ail_max_lsn(struct xfs_ail *ailp); struct xfs_log_item * xfs_trans_ail_cursor_first(struct xfs_ail *ailp, struct xfs_ail_cursor *cur, -- 2.21.0 (Apple Git-122.2)