[PATCH 12/16] xfs: parallelize inode inactivation

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Darrick J. Wong <djwong@xxxxxxxxxx>

Split the inode inactivation work into per-AG work items so that we can
take advantage of parallelization.

Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx>
---
 fs/xfs/libxfs/xfs_ag.c |    3 +
 fs/xfs/libxfs/xfs_ag.h |    3 +
 fs/xfs/xfs_icache.c    |  101 ++++++++++++++++++++++++++++++++++++++----------
 fs/xfs/xfs_mount.c     |   11 +++--
 fs/xfs/xfs_mount.h     |    2 -
 fs/xfs/xfs_super.c     |    1 
 fs/xfs/xfs_trace.h     |    8 ++--
 7 files changed, 97 insertions(+), 32 deletions(-)


diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index 0765a0ba30e1..7652d90d7d0d 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -173,6 +173,7 @@ __xfs_free_perag(
 	struct xfs_perag *pag = container_of(head, struct xfs_perag, rcu_head);
 
 	ASSERT(!delayed_work_pending(&pag->pag_blockgc_work));
+	ASSERT(!delayed_work_pending(&pag->pag_inodegc_work));
 	ASSERT(atomic_read(&pag->pag_ref) == 0);
 	kmem_free(pag);
 }
@@ -195,6 +196,7 @@ xfs_free_perag(
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
 
 		cancel_delayed_work_sync(&pag->pag_blockgc_work);
+		cancel_delayed_work_sync(&pag->pag_inodegc_work);
 		xfs_iunlink_destroy(pag);
 		xfs_buf_hash_destroy(pag);
 
@@ -253,6 +255,7 @@ xfs_initialize_perag(
 		spin_lock_init(&pag->pagb_lock);
 		spin_lock_init(&pag->pag_state_lock);
 		INIT_DELAYED_WORK(&pag->pag_blockgc_work, xfs_blockgc_worker);
+		INIT_DELAYED_WORK(&pag->pag_inodegc_work, xfs_inodegc_worker);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 		init_waitqueue_head(&pag->pagb_wait);
 		pag->pagb_count = 0;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 4c6f9045baca..3929ea35b0d4 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -96,6 +96,9 @@ struct xfs_perag {
 	/* background prealloc block trimming */
 	struct delayed_work	pag_blockgc_work;
 
+	/* background inode inactivation */
+	struct delayed_work	pag_inodegc_work;
+
 	/*
 	 * Unlinked inode information.  This incore information reflects
 	 * data stored in the AGI, so callers must hold the AGI buffer lock
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 210a9e3cd19e..f58d0455e38f 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -299,6 +299,43 @@ xfs_worker_delay_freesp(
 	return delay_ms >> shift;
 }
 
+/*
+ * Scale down the background work delay if we're low on free space in this AG.
+ * Similar to the way that we throttle preallocations, we halve the delay time
+ * for every low free space threshold that isn't met.  Return value is in ms.
+ */
+static inline unsigned int
+xfs_work_delay_perag(
+	struct xfs_perag	*pag,
+	unsigned int		delay_ms)
+{
+	struct xfs_mount	*mp = pag->pag_mount;
+	xfs_extlen_t		freesp;
+	unsigned int		shift = 0;
+
+	if (!pag->pagf_init)
+		return delay_ms;
+
+	/* Free space in this AG that can be allocated to file data */
+	freesp = pag->pagf_freeblks + pag->pagf_flcount;
+	freesp -= (pag->pag_meta_resv.ar_reserved +
+		   pag->pag_rmapbt_resv.ar_reserved);
+
+	if (freesp < mp->m_ag_low_space[XFS_LOWSP_5_PCNT]) {
+		shift = 2;
+		if (freesp < mp->m_ag_low_space[XFS_LOWSP_4_PCNT])
+			shift++;
+		if (freesp < mp->m_ag_low_space[XFS_LOWSP_3_PCNT])
+			shift++;
+		if (freesp < mp->m_ag_low_space[XFS_LOWSP_2_PCNT])
+			shift++;
+		if (freesp < mp->m_ag_low_space[XFS_LOWSP_1_PCNT])
+			shift++;
+	}
+
+	return delay_ms >> shift;
+}
+
 /*
  * Compute the lag between scheduling and executing background work based on
  * free space in the filesystem.  If an inode is passed in, its dquots will
@@ -306,18 +343,20 @@ xfs_worker_delay_freesp(
  */
 static inline unsigned int
 xfs_worker_delay_ms(
-	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
 	struct xfs_inode	*ip,
 	unsigned int		default_ms)
 {
-	unsigned int		udelay, gdelay, pdelay, fdelay;
+	struct xfs_mount	*mp = pag->pag_mount;
+	unsigned int		udelay, gdelay, pdelay, fdelay, adelay;
 
 	udelay = xfs_worker_delay_dquot(ip, XFS_DQTYPE_USER, default_ms);
 	gdelay = xfs_worker_delay_dquot(ip, XFS_DQTYPE_GROUP, default_ms);
 	pdelay = xfs_worker_delay_dquot(ip, XFS_DQTYPE_PROJ, default_ms);
 	fdelay = xfs_worker_delay_freesp(mp, default_ms);
+	adelay = xfs_work_delay_perag(pag, default_ms);
 
-	return min(min(udelay, gdelay), min(pdelay, fdelay));
+	return min(adelay, min(min(udelay, gdelay), min(pdelay, fdelay)));
 }
 
 /*
@@ -343,9 +382,11 @@ xfs_blockgc_queue(
  */
 static void
 xfs_inodegc_queue(
-	struct xfs_mount        *mp,
+	struct xfs_perag	*pag,
 	struct xfs_inode	*ip)
 {
+	struct xfs_mount        *mp = pag->pag_mount;
+
 	if (!test_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags))
 		return;
 
@@ -353,9 +394,9 @@ xfs_inodegc_queue(
 	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) {
 		unsigned int	delay;
 
-		delay = xfs_worker_delay_ms(mp, ip, xfs_inodegc_ms);
-		trace_xfs_inodegc_queue(mp, delay, _RET_IP_);
-		queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work,
+		delay = xfs_worker_delay_ms(pag, ip, xfs_inodegc_ms);
+		trace_xfs_inodegc_queue(mp, pag->pag_agno, delay, _RET_IP_);
+		queue_delayed_work(mp->m_gc_workqueue, &pag->pag_inodegc_work,
 				msecs_to_jiffies(delay));
 	}
 	rcu_read_unlock();
@@ -367,11 +408,13 @@ xfs_inodegc_queue(
  */
 static void
 xfs_inodegc_queue_sooner(
-	struct xfs_mount	*mp,
+	struct xfs_perag	*pag,
 	struct xfs_inode	*ip)
 {
+	struct xfs_mount	*mp = pag->pag_mount;
+
 	if (!XFS_IS_QUOTA_ON(mp) ||
-	    !delayed_work_pending(&mp->m_inodegc_work) ||
+	    !delayed_work_pending(&pag->pag_inodegc_work) ||
 	    !test_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags))
 		return;
 
@@ -379,11 +422,11 @@ xfs_inodegc_queue_sooner(
 	if (!radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG))
 		goto unlock;
 
-	if (xfs_worker_delay_ms(mp, ip, xfs_inodegc_ms) == xfs_inodegc_ms)
+	if (xfs_worker_delay_ms(pag, ip, xfs_inodegc_ms) == xfs_inodegc_ms)
 		goto unlock;
 
-	trace_xfs_inodegc_queue(mp, 0, _RET_IP_);
-	queue_delayed_work(mp->m_gc_workqueue, &mp->m_inodegc_work, 0);
+	trace_xfs_inodegc_queue(mp, pag->pag_agno, 0, _RET_IP_);
+	mod_delayed_work(mp->m_gc_workqueue, &pag->pag_inodegc_work, 0);
 unlock:
 	rcu_read_unlock();
 }
@@ -427,7 +470,7 @@ xfs_perag_set_inode_tag(
 		xfs_blockgc_queue(pag);
 		break;
 	case XFS_ICI_INODEGC_TAG:
-		xfs_inodegc_queue(mp, ip);
+		xfs_inodegc_queue(pag, ip);
 		break;
 	}
 
@@ -561,7 +604,7 @@ xfs_inode_mark_reclaimable(
 	spin_unlock(&pag->pag_ici_lock);
 
 	if (need_inactive && already_queued)
-		xfs_inodegc_queue_sooner(mp, ip);
+		xfs_inodegc_queue_sooner(pag, ip);
 
 	xfs_perag_put(pag);
 }
@@ -2058,16 +2101,17 @@ void
 xfs_inodegc_worker(
 	struct work_struct	*work)
 {
-	struct xfs_mount	*mp = container_of(to_delayed_work(work),
-					struct xfs_mount, m_inodegc_work);
+	struct xfs_perag	*pag = container_of(to_delayed_work(work),
+					struct xfs_perag, pag_inodegc_work);
+	struct xfs_mount	*mp = pag->pag_mount;
 
 	/*
 	 * Inactivation never returns error codes and never fails to push a
 	 * tagged inode to reclaim.  Loop until there there's nothing left.
 	 */
-	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_INODEGC_TAG)) {
-		trace_xfs_inodegc_worker(mp, 0, _RET_IP_);
-		xfs_icwalk(mp, XFS_ICWALK_INODEGC, NULL);
+	while (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_INODEGC_TAG)) {
+		trace_xfs_inodegc_worker(mp, pag->pag_agno, 0, _RET_IP_);
+		xfs_icwalk_ag(pag, XFS_ICWALK_INODEGC, NULL);
 	}
 }
 
@@ -2079,8 +2123,13 @@ void
 xfs_inodegc_flush(
 	struct xfs_mount	*mp)
 {
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+
 	trace_xfs_inodegc_flush(mp, 0, _RET_IP_);
-	flush_delayed_work(&mp->m_inodegc_work);
+
+	for_each_perag_tag(mp, agno, pag, XFS_ICI_INODEGC_TAG)
+		flush_delayed_work(&pag->pag_inodegc_work);
 }
 
 /* Disable the inode inactivation background worker and wait for it to stop. */
@@ -2088,10 +2137,14 @@ void
 xfs_inodegc_stop(
 	struct xfs_mount	*mp)
 {
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+
 	if (!test_and_clear_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags))
 		return;
 
-	cancel_delayed_work_sync(&mp->m_inodegc_work);
+	for_each_perag(mp, agno, pag)
+		cancel_delayed_work_sync(&pag->pag_inodegc_work);
 	trace_xfs_inodegc_stop(mp, 0, _RET_IP_);
 }
 
@@ -2103,11 +2156,15 @@ void
 xfs_inodegc_start(
 	struct xfs_mount	*mp)
 {
+	struct xfs_perag	*pag;
+	xfs_agnumber_t		agno;
+
 	if (test_and_set_bit(XFS_OPFLAG_INODEGC_RUNNING_BIT, &mp->m_opflags))
 		return;
 
 	trace_xfs_inodegc_start(mp, 0, _RET_IP_);
-	xfs_inodegc_queue(mp, NULL);
+	for_each_perag_tag(mp, agno, pag, XFS_ICI_INODEGC_TAG)
+		xfs_inodegc_queue(pag, NULL);
 }
 
 /* XFS Inode Cache Walking Code */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index ab65a14e51e6..eff375f92005 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -365,13 +365,16 @@ void
 xfs_set_low_space_thresholds(
 	struct xfs_mount	*mp)
 {
-	int i;
+	uint64_t		space = mp->m_sb.sb_dblocks;
+	uint32_t		ag_space = mp->m_sb.sb_agblocks;
+	int			i;
+
+	do_div(space, 100);
+	do_div(ag_space, 100);
 
 	for (i = 0; i < XFS_LOWSP_MAX; i++) {
-		uint64_t space = mp->m_sb.sb_dblocks;
-
-		do_div(space, 100);
 		mp->m_low_space[i] = space * (i + 1);
+		mp->m_ag_low_space[i] = ag_space * (i + 1);
 	}
 }
 
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index dc906b78e24c..154aa95d968c 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -131,6 +131,7 @@ typedef struct xfs_mount {
 	uint			m_rsumsize;	/* size of rt summary, bytes */
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_qflags;	/* quota status flags */
+	int32_t			m_ag_low_space[XFS_LOWSP_MAX];
 	uint64_t		m_flags;	/* global mount flags */
 	int64_t			m_low_space[XFS_LOWSP_MAX];
 	struct xfs_ino_geometry	m_ino_geo;	/* inode geometry */
@@ -191,7 +192,6 @@ typedef struct xfs_mount {
 	uint64_t		m_resblks_avail;/* available reserved blocks */
 	uint64_t		m_resblks_save;	/* reserved blks @ remount,ro */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
-	struct delayed_work	m_inodegc_work; /* background inode inactive */
 	struct xfs_kobj		m_kobj;
 	struct xfs_kobj		m_error_kobj;
 	struct xfs_kobj		m_error_meta_kobj;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 45ef63b5b2f0..66b61d38f401 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1865,7 +1865,6 @@ static int xfs_init_fs_context(
 	mutex_init(&mp->m_growlock);
 	INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-	INIT_DELAYED_WORK(&mp->m_inodegc_work, xfs_inodegc_worker);
 	mp->m_kobj.kobject.kset = xfs_kset;
 	/*
 	 * We don't create the finobt per-ag space reservation until after log
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ca9bfbd28886..404f2f32002f 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -122,7 +122,7 @@ TRACE_EVENT(xlog_intent_recovery_failed,
 		  __entry->error, __entry->function)
 );
 
-DECLARE_EVENT_CLASS(xfs_perag_class,
+DECLARE_EVENT_CLASS(xfs_perag_ref_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
 		 unsigned long caller_ip),
 	TP_ARGS(mp, agno, refcount, caller_ip),
@@ -146,7 +146,7 @@ DECLARE_EVENT_CLASS(xfs_perag_class,
 );
 
 #define DEFINE_PERAG_REF_EVENT(name)	\
-DEFINE_EVENT(xfs_perag_class, name,	\
+DEFINE_EVENT(xfs_perag_ref_class, name,	\
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,	\
 		 unsigned long caller_ip),					\
 	TP_ARGS(mp, agno, refcount, caller_ip))
@@ -155,6 +155,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_put);
 DEFINE_PERAG_REF_EVENT(xfs_perag_set_inode_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_inode_tag);
+DEFINE_PERAG_REF_EVENT(xfs_inodegc_queue);
+DEFINE_PERAG_REF_EVENT(xfs_inodegc_worker);
 
 DECLARE_EVENT_CLASS(xfs_fs_class,
 	TP_PROTO(struct xfs_mount *mp, int data, unsigned long caller_ip),
@@ -191,8 +193,6 @@ DEFINE_EVENT(xfs_fs_class, name,					\
 DEFINE_FS_EVENT(xfs_inodegc_flush);
 DEFINE_FS_EVENT(xfs_inodegc_start);
 DEFINE_FS_EVENT(xfs_inodegc_stop);
-DEFINE_FS_EVENT(xfs_inodegc_queue);
-DEFINE_FS_EVENT(xfs_inodegc_worker);
 DEFINE_FS_EVENT(xfs_fs_sync_fs);
 
 DECLARE_EVENT_CLASS(xfs_ag_class,




[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux