[PATCH] xfs: avoid freeing multiple extents from same AG in pending transactions

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



To avoid possible deadlock when allocating AGFL blocks, change the behaviour.
The orignal hehaviour for freeing extents in an EFI is that the extents in
the EFI were processed one by one in the same transaction. When the second
and subsequent extents are being processed, we have produced busy extents for
previous extents. If the second and subsequent extents are from the same AG
as the busy extents are, we have the risk of deadlock when allocating AGFL
blocks. A typical calltrace for the deadlock is like this:

	#0	context_switch() kernel/sched/core.c:3881
	#1	__schedule() kernel/sched/core.c:5111
	#2	schedule() kernel/sched/core.c:5186
	#3	xfs_extent_busy_flush() fs/xfs/xfs_extent_busy.c:598
	#4	xfs_alloc_ag_vextent_size() fs/xfs/libxfs/xfs_alloc.c:1641
	#5	xfs_alloc_ag_vextent() fs/xfs/libxfs/xfs_alloc.c:828
	#6	xfs_alloc_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:2362
	#7	xfs_free_extent_fix_freelist() fs/xfs/libxfs/xfs_alloc.c:3029
	#8	__xfs_free_extent() fs/xfs/libxfs/xfs_alloc.c:3067
	#9	xfs_trans_free_extent() fs/xfs/xfs_extfree_item.c:370
	#10	xfs_efi_recover() fs/xfs/xfs_extfree_item.c:626
	#11	xlog_recover_process_efi() fs/xfs/xfs_log_recover.c:4605
	#12	xlog_recover_process_intents() fs/xfs/xfs_log_recover.c:4893
	#13	xlog_recover_finish() fs/xfs/xfs_log_recover.c:5824
	#14	xfs_log_mount_finish() fs/xfs/xfs_log.c:764
	#15	xfs_mountfs() fs/xfs/xfs_mount.c:978
	#16	xfs_fs_fill_super() fs/xfs/xfs_super.c:1908
	#17	mount_bdev() fs/super.c:1417
	#18	xfs_fs_mount() fs/xfs/xfs_super.c:1985
	#19	legacy_get_tree() fs/fs_context.c:647
	#20	vfs_get_tree() fs/super.c:1547
	#21	do_new_mount() fs/namespace.c:2843
	#22	do_mount() fs/namespace.c:3163
	#23	ksys_mount() fs/namespace.c:3372
	#24	__do_sys_mount() fs/namespace.c:3386
	#25	__se_sys_mount() fs/namespace.c:3383
	#26	__x64_sys_mount() fs/namespace.c:3383
	#27	do_syscall_64() arch/x86/entry/common.c:296
	#28	entry_SYSCALL_64() arch/x86/entry/entry_64.S:180

The deadlock could happen at both IO time and log recover time.

To avoid above deadlock, this patch changes the extent free procedure.
1) it always let the first extent from the EFI go (no change).
2) increase the (new) AG counter when it let a extent go.
3) for the 2nd+ extents, if the owning AGs ready have pending extents
   don't let the extent go with current transaction. Instead, move the
   extent in question and subsequent extents to a new EFI and try the new
   EFI again with new transaction (by rolling current transaction).
4) for the EFD to orginal EFI, fill it with all the extents from the original
   EFI.
5) though the new EFI is placed after original EFD, it's safe as they are in
   same in-memory transaction.
6) The new AG counter for pending extent freeings is decremented after the
   log items in in-memory transaction is committed to CIL.

Signed-off-by: Wengang Wang <wen.gang.wang@xxxxxxxxxx>
---
 fs/xfs/libxfs/xfs_ag.c    |   1 +
 fs/xfs/libxfs/xfs_ag.h    |   5 ++
 fs/xfs/xfs_extfree_item.c | 111 +++++++++++++++++++++++++++++++++++++-
 fs/xfs/xfs_log_cil.c      |  24 ++++++++-
 4 files changed, 138 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 86696a1c6891..61ef61e05668 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -378,6 +378,7 @@ xfs_initialize_perag(
 		pag->pagb_tree = RB_ROOT;
 #endif /* __KERNEL__ */
 
+		atomic_set(&pag->pag_nr_pending_extents, 0);
 		error = xfs_buf_hash_init(pag);
 		if (error)
 			goto out_remove_pag;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 5e18536dfdce..5950bc36a32c 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -82,6 +82,11 @@ struct xfs_perag {
 	uint16_t	pag_sick;
 	spinlock_t	pag_state_lock;
 
+	/*
+	 * Number of concurrent extent freeings (not committed to CIL yet)
+	 * on this AG.
+	 */
+	atomic_t	pag_nr_pending_extents;
 	spinlock_t	pagb_lock;	/* lock for pagb_tree */
 	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
 	unsigned int	pagb_gen;	/* generation count for pagb_tree */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 011b50469301..1dbf36d9c1c9 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -336,6 +336,75 @@ xfs_trans_get_efd(
 	return efdp;
 }
 
+/*
+ * Fill the EFD with all extents from the EFI and set the counter.
+ * Note: the EFD should comtain at least one extents already.
+ */
+static void xfs_fill_efd_with_efi(struct xfs_efd_log_item *efdp)
+{
+	struct xfs_efi_log_item	*efip = efdp->efd_efip;
+	uint			i;
+
+	i = efdp->efd_next_extent;
+	ASSERT(i > 0);
+	for (; i < efip->efi_format.efi_nextents; i++) {
+		efdp->efd_format.efd_extents[i] =
+			efip->efi_format.efi_extents[i];
+	}
+	efdp->efd_next_extent = i;
+}
+
+/*
+ * Check if xefi is the first in the efip.
+ * Returns true if so, ad false otherwise
+ */
+static bool xfs_is_first_extent_in_efi(struct xfs_efi_log_item *efip,
+				  struct xfs_extent_free_item *xefi)
+{
+	return efip->efi_format.efi_extents[0].ext_start ==
+					xefi->xefi_startblock;
+}
+
+/*
+ * Check if the xefi needs to be in a new transaction.
+ * efip is the containing EFI of xefi.
+ * Return true if so, false otherwise.
+ */
+static bool xfs_extent_free_need_new_trans(struct xfs_mount *mp,
+				    struct xfs_efi_log_item *efip,
+				    struct xfs_extent_free_item *xefi)
+{
+	bool			ret = true;
+	int			nr_pre;
+	xfs_agnumber_t		agno;
+	struct xfs_perag	*pag;
+
+	agno = XFS_FSB_TO_AGNO(mp, xefi->xefi_startblock);
+	pag = xfs_perag_get(mp, agno);
+	/* The first extent in EFI is always OK to go */
+	if (xfs_is_first_extent_in_efi(efip, xefi)) {
+		atomic_inc(&pag->pag_nr_pending_extents);
+		ret = false;
+		goto out_put;
+	}
+
+	/*
+	 * Now the extent is the 2nd or subsequent in the efip. We need
+	 * new transaction if the AG already has busy extents pending.
+	 */
+	nr_pre = atomic_inc_return(&pag->pag_nr_pending_extents) - 1;
+	/* No prevoius pending extent freeing to this AG */
+	if (nr_pre == 0) {
+		ret = false;
+		goto out_put;
+	}
+
+	atomic_dec(&pag->pag_nr_pending_extents);
+out_put:
+	xfs_perag_put(pag);
+	return ret;
+}
+
 /*
  * Free an extent and log it to the EFD. Note that the transaction is marked
  * dirty regardless of whether the extent free succeeds or fails to support the
@@ -356,6 +425,28 @@ xfs_trans_free_extent(
 	xfs_agblock_t			agbno = XFS_FSB_TO_AGBNO(mp,
 							xefi->xefi_startblock);
 	int				error;
+	struct xfs_efi_log_item		*efip = efdp->efd_efip;
+
+	if (xfs_extent_free_need_new_trans(mp, efip, xefi)) {
+		/*
+		 * This should be the 2nd+ extent, we don't have to mark the
+		 * transaction and efd dirty, those are already done with the
+		 * first extent.
+		 */
+		ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
+		ASSERT(tp->t_flags & XFS_TRANS_HAS_INTENT_DONE);
+		ASSERT(test_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags));
+
+		xfs_fill_efd_with_efi(efdp);
+
+		/*
+		 * A preious extent in same AG is processed with the current
+		 * transaction. To avoid possible AGFL allocation deadlock,
+		 * we roll the transaction and then restart with this extent
+		 * with new transaction.
+		 */
+		return -EAGAIN;
+	}
 
 	oinfo.oi_owner = xefi->xefi_owner;
 	if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
@@ -369,6 +460,13 @@ xfs_trans_free_extent(
 	error = __xfs_free_extent(tp, xefi->xefi_startblock,
 			xefi->xefi_blockcount, &oinfo, XFS_AG_RESV_NONE,
 			xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
+	if (error) {
+		struct xfs_perag	*pag;
+
+		pag = xfs_perag_get(mp, agno);
+		atomic_dec(&pag->pag_nr_pending_extents);
+		xfs_perag_put(pag);
+	}
 	/*
 	 * Mark the transaction dirty, even on error. This ensures the
 	 * transaction is aborted, which:
@@ -476,7 +574,8 @@ xfs_extent_free_finish_item(
 	xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
 
 	error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
-	kmem_cache_free(xfs_extfree_item_cache, xefi);
+	if (error != -EAGAIN)
+		kmem_cache_free(xfs_extfree_item_cache, xefi);
 	return error;
 }
 
@@ -632,7 +731,15 @@ xfs_efi_item_recover(
 		fake.xefi_startblock = extp->ext_start;
 		fake.xefi_blockcount = extp->ext_len;
 
-		error = xfs_trans_free_extent(tp, efdp, &fake);
+		if (error == 0)
+			error = xfs_trans_free_extent(tp, efdp, &fake);
+
+		if (error == -EAGAIN) {
+			ASSERT(i > 0);
+			xfs_free_extent_later(tp, fake.xefi_startblock,
+			fake.xefi_blockcount, &XFS_RMAP_OINFO_ANY_OWNER);
+			continue;
+		}
 		if (error == -EFSCORRUPTED)
 			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 					extp, sizeof(*extp));
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index eccbfb99e894..97eda4487db0 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -16,6 +16,7 @@
 #include "xfs_log.h"
 #include "xfs_log_priv.h"
 #include "xfs_trace.h"
+#include "xfs_ag.h"
 
 struct workqueue_struct *xfs_discard_wq;
 
@@ -643,8 +644,29 @@ xlog_cil_insert_items(
 		cilpcp->space_used += len;
 	}
 	/* attach the transaction to the CIL if it has any busy extents */
-	if (!list_empty(&tp->t_busy))
+	if (!list_empty(&tp->t_busy)) {
+		struct xfs_perag	*last_pag = NULL;
+		xfs_agnumber_t		last_agno = -1;
+		struct xfs_extent_busy	*ebp;
+
+		/*
+		 * Pending extent freeings are committed to CIL, now it's
+		 * to let other extent freeing on same AG go.
+		 */
+		list_for_each_entry(ebp, &tp->t_busy, list) {
+			if (ebp->agno != last_agno) {
+				last_agno = ebp->agno;
+				if (last_pag)
+					xfs_perag_put(last_pag);
+				last_pag = xfs_perag_get(tp->t_mountp, last_agno);
+			}
+			atomic_dec(&last_pag->pag_nr_pending_extents);
+		}
+		if (last_pag)
+			xfs_perag_put(last_pag);
+
 		list_splice_init(&tp->t_busy, &cilpcp->busy_extents);
+	}
 
 	/*
 	 * Now update the order of everything modified in the transaction
-- 
2.21.0 (Apple Git-122.2)




[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux