[PATCH 7/7] xfs: iscan batching should handle unallocated inodes too

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Darrick J. Wong <djwong@xxxxxxxxxx>

The inode scanner tries to reduce contention on the AGI header buffer
lock by grabbing references to consecutive allocated inodes.  Batching
stops as soon as we encounter an unallocated inode.  This is unfortunate
because in the worst case performance collapses to the old "one at a
time" behavior if every other inode is free.

This is correct behavior, but we could do better.  Unallocated inodes by
definition have nothing to scan, which means the iscan can ignore them
as long as someone ensures that the scan data will reflect another
thread allocating the inode and adding interesting metadata to that
inode.  That mechanism is, of course, the live update hooks.

Therefore, extend the batching mechanism to track unallocated inodes
adjacent to the scan cursor.  The _want_live_update predicate can tell
the caller's live update hook to incorporate all live updates to what
the scanner thinks is an unallocated inode if (after dropping the AGI)
some other thread allocates one of those inodes and begins using it.

Note that we cannot just copy the ir_free bitmap into the scan cursor
because the batching stops if iget says the inode is in an intermediate
state (e.g. on the inactivation list) and cannot be igrabbed.

Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx>
---
 fs/xfs/scrub/iscan.c |  107 +++++++++++++++++++++++++++++++++++++++++++++-----
 fs/xfs/scrub/iscan.h |    6 ++-
 fs/xfs/scrub/trace.h |   21 ++++++++--
 3 files changed, 119 insertions(+), 15 deletions(-)


diff --git a/fs/xfs/scrub/iscan.c b/fs/xfs/scrub/iscan.c
index ba93258c47030..4e570b5d98b1b 100644
--- a/fs/xfs/scrub/iscan.c
+++ b/fs/xfs/scrub/iscan.c
@@ -61,7 +61,8 @@ xchk_iscan_find_next(
 	struct xfs_buf		*agi_bp,
 	struct xfs_perag	*pag,
 	xfs_inofree_t		*allocmaskp,
-	xfs_agino_t		*cursor)
+	xfs_agino_t		*cursor,
+	uint8_t			*nr_inodesp)
 {
 	struct xfs_scrub	*sc = iscan->sc;
 	struct xfs_inobt_rec_incore	rec;
@@ -147,6 +148,7 @@ xchk_iscan_find_next(
 			ASSERT(next >= 0);
 			*cursor = rec.ir_startino + next;
 			*allocmaskp = allocmask >> next;
+			*nr_inodesp = XFS_INODES_PER_CHUNK - next;
 			break;
 		}
 	}
@@ -228,7 +230,8 @@ xchk_iscan_advance(
 	struct xchk_iscan	*iscan,
 	struct xfs_perag	**pagp,
 	struct xfs_buf		**agi_bpp,
-	xfs_inofree_t		*allocmaskp)
+	xfs_inofree_t		*allocmaskp,
+	uint8_t			*nr_inodesp)
 {
 	struct xfs_scrub	*sc = iscan->sc;
 	struct xfs_mount	*mp = sc->mp;
@@ -255,7 +258,7 @@ xchk_iscan_advance(
 
 		agino = XFS_INO_TO_AGINO(mp, iscan->cursor_ino);
 		ret = xchk_iscan_find_next(iscan, agi_bp, pag, allocmaskp,
-				&agino);
+				&agino, nr_inodesp);
 		if (ret)
 			goto out_buf;
 
@@ -344,12 +347,14 @@ xchk_iscan_iget(
 	struct xchk_iscan	*iscan,
 	struct xfs_perag	*pag,
 	struct xfs_buf		*agi_bp,
-	xfs_inofree_t		allocmask)
+	xfs_inofree_t		allocmask,
+	uint8_t			nr_inodes)
 {
 	struct xfs_scrub	*sc = iscan->sc;
 	struct xfs_mount	*mp = sc->mp;
 	xfs_ino_t		ino = iscan->cursor_ino;
 	unsigned int		idx = 0;
+	unsigned int		i;
 	int			error;
 
 	ASSERT(iscan->__inodes[0] == NULL);
@@ -399,10 +404,28 @@ xchk_iscan_iget(
 	/*
 	 * Now that we've filled the first slot in __inodes, try to fill the
 	 * rest of the batch with consecutively ordered inodes.  to reduce the
-	 * number of _iter calls.  If we can't get an inode, we stop and return
-	 * what we have.
+	 * number of _iter calls.  Make a bitmap of unallocated inodes from the
+	 * zeroes in the inuse bitmap; these inodes will not be scanned, but
+	 * the _want_live_update predicate will pass through all live updates.
+	 *
+	 * If we can't iget an allocated inode, stop and return what we have.
 	 */
-	for (; allocmask & 1; allocmask >>= 1, ino++, idx++) {
+	mutex_lock(&iscan->lock);
+	iscan->__batch_ino = ino - 1;
+	iscan->__skipped_inomask = 0;
+	mutex_unlock(&iscan->lock);
+
+	for (i = 1; i < nr_inodes; i++, ino++, allocmask >>= 1) {
+		if (!(allocmask & 1)) {
+			ASSERT(!(iscan->__skipped_inomask & (1ULL << i)));
+
+			mutex_lock(&iscan->lock);
+			iscan->cursor_ino = ino;
+			iscan->__skipped_inomask |= (1ULL << i);
+			mutex_unlock(&iscan->lock);
+			continue;
+		}
+
 		ASSERT(iscan->__inodes[idx] == NULL);
 
 		error = xfs_iget(sc->mp, sc->tp, ino, XFS_IGET_NORETRY, 0,
@@ -413,14 +436,42 @@ xchk_iscan_iget(
 		mutex_lock(&iscan->lock);
 		iscan->cursor_ino = ino;
 		mutex_unlock(&iscan->lock);
+		idx++;
 	}
 
-	trace_xchk_iscan_iget_batch(sc->mp, iscan, idx);
+	trace_xchk_iscan_iget_batch(sc->mp, iscan, nr_inodes, idx);
 	xfs_trans_brelse(sc->tp, agi_bp);
 	xfs_perag_put(pag);
 	return idx;
 }
 
+/*
+ * Advance the visit cursor to reflect skipped inodes beyond whatever we
+ * scanned.
+ */
+STATIC void
+xchk_iscan_finish_batch(
+	struct xchk_iscan	*iscan)
+{
+	xfs_ino_t		highest_skipped;
+
+	mutex_lock(&iscan->lock);
+
+	if (iscan->__batch_ino != NULLFSINO) {
+		highest_skipped = iscan->__batch_ino +
+					xfs_highbit64(iscan->__skipped_inomask);
+		iscan->__visited_ino = max(iscan->__visited_ino,
+					   highest_skipped);
+
+		trace_xchk_iscan_skip(iscan);
+	}
+
+	iscan->__batch_ino = NULLFSINO;
+	iscan->__skipped_inomask = 0;
+
+	mutex_unlock(&iscan->lock);
+}
+
 /*
  * Advance the inode scan cursor to the next allocated inode and return up to
  * 64 consecutive allocated inodes starting with the cursor position.
@@ -432,6 +483,8 @@ xchk_iscan_iter_batch(
 	struct xfs_scrub	*sc = iscan->sc;
 	int			ret;
 
+	xchk_iscan_finish_batch(iscan);
+
 	if (iscan->iget_timeout)
 		iscan->__iget_deadline = jiffies +
 					 msecs_to_jiffies(iscan->iget_timeout);
@@ -440,8 +493,10 @@ xchk_iscan_iter_batch(
 		struct xfs_buf	*agi_bp = NULL;
 		struct xfs_perag *pag = NULL;
 		xfs_inofree_t	allocmask = 0;
+		uint8_t		nr_inodes = 0;
 
-		ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask);
+		ret = xchk_iscan_advance(iscan, &pag, &agi_bp, &allocmask,
+				&nr_inodes);
 		if (ret != 1)
 			return ret;
 
@@ -452,7 +507,7 @@ xchk_iscan_iter_batch(
 			break;
 		}
 
-		ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask);
+		ret = xchk_iscan_iget(iscan, pag, agi_bp, allocmask, nr_inodes);
 	} while (ret == -EAGAIN);
 
 	return ret;
@@ -559,6 +614,9 @@ xchk_iscan_start(
 
 	start_ino = xchk_iscan_rotor(sc->mp);
 
+	iscan->__batch_ino = NULLFSINO;
+	iscan->__skipped_inomask = 0;
+
 	iscan->sc = sc;
 	clear_bit(XCHK_ISCAN_OPSTATE_ABORTED, &iscan->__opstate);
 	iscan->iget_timeout = iget_timeout;
@@ -587,6 +645,26 @@ xchk_iscan_mark_visited(
 	mutex_unlock(&iscan->lock);
 }
 
+/*
+ * Did we skip this inode because it wasn't allocated when we loaded the batch?
+ * If so, it is newly allocated and will not be scanned.  All live updates to
+ * this inode must be passed to the caller to maintain scan correctness.
+ */
+static inline bool
+xchk_iscan_skipped(
+	const struct xchk_iscan	*iscan,
+	xfs_ino_t		ino)
+{
+	if (iscan->__batch_ino == NULLFSINO)
+		return false;
+	if (ino < iscan->__batch_ino)
+		return false;
+	if (ino >= iscan->__batch_ino + XFS_INODES_PER_CHUNK)
+		return false;
+
+	return iscan->__skipped_inomask & (1ULL << (ino - iscan->__batch_ino));
+}
+
 /*
  * Do we need a live update for this inode?  This is true if the scanner thread
  * has visited this inode and the scan hasn't been aborted due to errors.
@@ -622,6 +700,15 @@ xchk_iscan_want_live_update(
 		goto unlock;
 	}
 
+	/*
+	 * This inode was not allocated at the time of the iscan batch.
+	 * The caller should receive all updates.
+	 */
+	if (xchk_iscan_skipped(iscan, ino)) {
+		ret = true;
+		goto unlock;
+	}
+
 	/*
 	 * The visited cursor hasn't yet wrapped around the end of the FS.  If
 	 * @ino is inside the starred range, the caller should receive updates:
diff --git a/fs/xfs/scrub/iscan.h b/fs/xfs/scrub/iscan.h
index f7317af807ddc..365d54e35cd94 100644
--- a/fs/xfs/scrub/iscan.h
+++ b/fs/xfs/scrub/iscan.h
@@ -44,8 +44,12 @@ struct xchk_iscan {
 
 	/*
 	 * The scan grabs batches of inodes and stashes them here before
-	 * handing them out with _iter.
+	 * handing them out with _iter.  Unallocated inodes are set in the
+	 * mask so that all updates to that inode are selected for live
+	 * update propagation.
 	 */
+	xfs_ino_t		__batch_ino;
+	xfs_inofree_t		__skipped_inomask;
 	struct xfs_inode	*__inodes[XFS_INODES_PER_CHUNK];
 };
 
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 896910be3173b..e1dcd5f6ce5dd 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -1145,6 +1145,7 @@ DEFINE_EVENT(xchk_iscan_class, name, \
 	TP_ARGS(iscan))
 DEFINE_ISCAN_EVENT(xchk_iscan_move_cursor);
 DEFINE_ISCAN_EVENT(xchk_iscan_visit);
+DEFINE_ISCAN_EVENT(xchk_iscan_skip);
 DEFINE_ISCAN_EVENT(xchk_iscan_advance_ag);
 
 DECLARE_EVENT_CLASS(xchk_iscan_ino_class,
@@ -1202,25 +1203,37 @@ TRACE_EVENT(xchk_iscan_iget,
 
 TRACE_EVENT(xchk_iscan_iget_batch,
 	TP_PROTO(struct xfs_mount *mp, struct xchk_iscan *iscan,
-		 unsigned int nr),
-	TP_ARGS(mp, iscan, nr),
+		 unsigned int nr, unsigned int avail),
+	TP_ARGS(mp, iscan, nr, avail),
 	TP_STRUCT__entry(
 		__field(dev_t, dev)
 		__field(xfs_ino_t, cursor)
 		__field(xfs_ino_t, visited)
 		__field(unsigned int, nr)
+		__field(unsigned int, avail)
+		__field(unsigned int, unavail)
+		__field(xfs_ino_t, batch_ino)
+		__field(unsigned long long, skipmask)
 	),
 	TP_fast_assign(
 		__entry->dev = mp->m_super->s_dev;
 		__entry->cursor = iscan->cursor_ino;
 		__entry->visited = iscan->__visited_ino;
 		__entry->nr = nr;
+		__entry->avail = avail;
+		__entry->unavail = hweight64(iscan->__skipped_inomask);
+		__entry->batch_ino = iscan->__batch_ino;
+		__entry->skipmask = iscan->__skipped_inomask;
 	),
-	TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx nr %d",
+	TP_printk("dev %d:%d iscan cursor 0x%llx visited 0x%llx batchino 0x%llx skipmask 0x%llx nr %u avail %u unavail %u",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->cursor,
 		  __entry->visited,
-		  __entry->nr)
+		  __entry->batch_ino,
+		  __entry->skipmask,
+		  __entry->nr,
+		  __entry->avail,
+		  __entry->unavail)
 );
 
 TRACE_EVENT(xchk_iscan_iget_retry_wait,





[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux