From: Dave Chinner <dchinner@xxxxxxxxxx> XFS has a per-cpu counter implementation for in-core superblock counters that pre-dated the generic implementation. It is complex and baroque as it is tailored directly to the needs of ENOSPC detection. Implement the complex accurate-compare-and-add calculation in the generic per-cpu counter code and convert the XFS counters to use the much simpler generic counter code. Passes xfsqa on an SMP system. Still to do: 1. UP build and test. 2. split into separate patches For discussion: 1. kill the no-per-cpu-counter mode? 2. do we need a custom batch size? 3. do we need to factor xfs_mod_sb_incore()? 4. should all the readers just sum the counters themselves and kill the wrappers? Signed-off-by: Dave Chinner <dchinner@xxxxxxxxxx> --- fs/xfs/linux-2.6/xfs_linux.h | 9 - fs/xfs/linux-2.6/xfs_super.c | 4 +- fs/xfs/xfs_fsops.c | 4 +- fs/xfs/xfs_mount.c | 834 +++++++++++----------------------------- fs/xfs/xfs_mount.h | 80 +--- include/linux/percpu_counter.h | 16 + lib/percpu_counter.c | 79 ++++ 7 files changed, 341 insertions(+), 685 deletions(-) diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 214ddd7..9fa4f2a 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -88,15 +88,6 @@ #include <xfs_super.h> #include <xfs_buf.h> -/* - * Feature macros (disable/enable) - */ -#ifdef CONFIG_SMP -#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ -#else -#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ -#endif - #define irix_sgid_inherit xfs_params.sgid_inherit.val #define irix_symlink_mode xfs_params.symlink_mode.val #define xfs_panic_mask xfs_params.panic_mask.val diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 53ab47f..fa789b7 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1230,9 +1230,9 @@ xfs_fs_statfs( statp->f_fsid.val[0] = (u32)id; statp->f_fsid.val[1] = (u32)(id >> 32); - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); - spin_lock(&mp->m_sb_lock); + spin_lock(&mp->m_sb_lock); + xfs_icsb_sync_counters_locked(mp); + statp->f_bsize = sbp->sb_blocksize; lsize = sbp->sb_logstart ?
sbp->sb_logblocks : 0; statp->f_blocks = sbp->sb_dblocks - lsize; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index a7c116e..44ecf1b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -478,7 +478,7 @@ xfs_fs_counts( xfs_mount_t *mp, xfs_fsop_counts_t *cnt) { - xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); + xfs_icsb_sync_counters(mp); spin_lock(&mp->m_sb_lock); cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); cnt->freertx = mp->m_sb.sb_frextents; @@ -540,7 +540,7 @@ xfs_reserve_blocks( */ retry: spin_lock(&mp->m_sb_lock); - xfs_icsb_sync_counters_locked(mp, 0); + xfs_icsb_sync_counters_locked(mp); /* * If our previous reservation was larger than the current value, diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 19e9dfa..0d9a030 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -46,19 +46,6 @@ STATIC void xfs_unmountfs_wait(xfs_mount_t *); - -#ifdef HAVE_PERCPU_SB -STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, - int); -STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, - int); -STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); -#else - -#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) -#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) -#endif - static const struct { short offset; short type; /* 0 = integer @@ -280,6 +267,111 @@ xfs_free_perag( } } + +/* + * Per-cpu incore superblock counters + * + * Simple concept, difficult implementation, now somewhat simplified by generic + * per-cpu counter support. This provides distributed per cpu counters for + * contended fields (e.g. free block count). + * + * Difficulties arise in that the incore sb is used for ENOSPC checking, and + * hence needs to be accurately read when we are running low on space. Hence We + * need to check against counter error bounds and determine how accurately to + * sum based on that metric. The percpu counters take care of this for us, + * so we only need to modify the fast path to handle per-cpu counter error + * cases. 
+ */ +static inline int +xfs_icsb_add( + struct xfs_mount *mp, + int counter, + int64_t delta, + int64_t threshold) +{ + int ret; + + ret = percpu_counter_add_unless_lt(&mp->m_icsb[counter], delta, + threshold); + if (ret < 0) + return -ENOSPC; + return 0; +} + +static inline void +xfs_icsb_set( + struct xfs_mount *mp, + int counter, + int64_t value) +{ + percpu_counter_set(&mp->m_icsb[counter], value); +} + +static inline int64_t +xfs_icsb_sum( + struct xfs_mount *mp, + int counter) +{ + return percpu_counter_sum_positive(&mp->m_icsb[counter]); +} + +static inline int64_t +xfs_icsb_read( + struct xfs_mount *mp, + int counter) +{ + return percpu_counter_read_positive(&mp->m_icsb[counter]); +} + +void +xfs_icsb_reinit_counters( + struct xfs_mount *mp) +{ + xfs_icsb_set(mp, XFS_ICSB_FDBLOCKS, mp->m_sb.sb_fdblocks); + xfs_icsb_set(mp, XFS_ICSB_IFREE, mp->m_sb.sb_ifree); + xfs_icsb_set(mp, XFS_ICSB_ICOUNT, mp->m_sb.sb_icount); +} + +int +xfs_icsb_init_counters( + struct xfs_mount *mp) +{ + int i; + int error; + + for (i = 0; i < XFS_ICSB_MAX; i++) { + error = percpu_counter_init(&mp->m_icsb[i], 0); + if (error) + goto out_error; + } + xfs_icsb_reinit_counters(mp); + return 0; + +out_error: + for (; i >= 0; i--) + percpu_counter_destroy(&mp->m_icsb[i]); + return error; +} + +void +xfs_icsb_destroy_counters( + xfs_mount_t *mp) +{ + int i; + + for (i = 0; i < XFS_ICSB_MAX; i++) + percpu_counter_destroy(&mp->m_icsb[i]); +} + +void +xfs_icsb_sync_counters_locked( + xfs_mount_t *mp) +{ + mp->m_sb.sb_icount = xfs_icsb_sum(mp, XFS_ICSB_ICOUNT); + mp->m_sb.sb_ifree = xfs_icsb_sum(mp, XFS_ICSB_IFREE); + mp->m_sb.sb_fdblocks = xfs_icsb_sum(mp, XFS_ICSB_FDBLOCKS); +} + /* * Check size of device based on the (data/realtime) block count. * Note: this check is used by the growfs code as well as mount. @@ -1562,7 +1654,7 @@ xfs_log_sbcount( if (!xfs_fs_writable(mp)) return 0; - xfs_icsb_sync_counters(mp, 0); + xfs_icsb_sync_counters(mp); /* * we don't need to do this if we are updating the superblock @@ -1674,9 +1766,9 @@ xfs_mod_incore_sb_unlocked( int64_t delta, int rsvd) { - int scounter; /* short counter for 32 bit fields */ - long long lcounter; /* long counter for 64 bit fields */ - long long res_used, rem; + int scounter = 0; /* short counter for 32 bit fields */ + long long lcounter = 0; /* long counter for 64 bit fields */ + long long res_used; /* * With the in-core superblock spin lock held, switch @@ -1708,43 +1800,45 @@ xfs_mod_incore_sb_unlocked( mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); res_used = (long long)(mp->m_resblks - mp->m_resblks_avail); - if (delta > 0) { /* Putting blocks back */ + /* + * if we are putting blocks back, put them into the reserve + * block pool first. + */ + if (res_used && delta > 0) { if (res_used > delta) { mp->m_resblks_avail += delta; + delta = 0; } else { - rem = delta - res_used; mp->m_resblks_avail = mp->m_resblks; - lcounter += rem; + delta -= res_used; } - } else { /* Taking blocks away */ - lcounter += delta; - if (lcounter >= 0) { - mp->m_sb.sb_fdblocks = lcounter + - XFS_ALLOC_SET_ASIDE(mp); + if (!delta) return 0; - } + } - /* - * We are out of blocks, use any available reserved - * blocks if were allowed to. 
- */ - if (!rsvd) - return XFS_ERROR(ENOSPC); + lcounter += delta; + if (likely(lcounter >= 0)) { + mp->m_sb.sb_fdblocks = lcounter + + XFS_ALLOC_SET_ASIDE(mp); + return 0; + } - lcounter = (long long)mp->m_resblks_avail + delta; - if (lcounter >= 0) { - mp->m_resblks_avail = lcounter; - return 0; - } - printk_once(KERN_WARNING - "Filesystem \"%s\": reserve blocks depleted! " - "Consider increasing reserve pool size.", - mp->m_fsname); + /* ENOSPC */ + ASSERT(delta < 0); + if (!rsvd) return XFS_ERROR(ENOSPC); + + lcounter = (long long)mp->m_resblks_avail + delta; + if (lcounter >= 0) { + mp->m_resblks_avail = lcounter; + return 0; } + printk_once(KERN_WARNING + "Filesystem \"%s\": reserve blocks depleted! " + "Consider increasing reserve pool size.", + mp->m_fsname); + return XFS_ERROR(ENOSPC); - mp->m_sb.sb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); - return 0; case XFS_SBS_FREXTENTS: lcounter = (long long)mp->m_sb.sb_frextents; lcounter += delta; @@ -1846,9 +1940,7 @@ xfs_mod_incore_sb( { int status; -#ifdef HAVE_PERCPU_SB ASSERT(field < XFS_SBS_ICOUNT || field > XFS_SBS_FDBLOCKS); -#endif spin_lock(&mp->m_sb_lock); status = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); spin_unlock(&mp->m_sb_lock); @@ -1907,6 +1999,89 @@ unwind: return error; } +int +xfs_icsb_modify_counters( + xfs_mount_t *mp, + xfs_sb_field_t field, + int64_t delta, + int rsvd) +{ + int64_t lcounter; + int64_t res_used; + int ret = 0; + + + switch (field) { + case XFS_SBS_ICOUNT: + ret = xfs_icsb_add(mp, XFS_ICSB_ICOUNT, delta, 0); + if (ret < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + return 0; + + case XFS_SBS_IFREE: + ret = xfs_icsb_add(mp, XFS_ICSB_IFREE, delta, 0); + if (ret < 0) { + ASSERT(0); + return XFS_ERROR(EINVAL); + } + return 0; + + case XFS_SBS_FDBLOCKS: + /* + * if we are putting blocks back, put them into the reserve + * block pool first. + */ + if (mp->m_resblks != mp->m_resblks_avail && delta > 0) { + spin_lock(&mp->m_sb_lock); + res_used = (int64_t)(mp->m_resblks - + mp->m_resblks_avail); + if (res_used > delta) { + mp->m_resblks_avail += delta; + delta = 0; + } else { + delta -= res_used; + mp->m_resblks_avail = mp->m_resblks; + } + spin_unlock(&mp->m_sb_lock); + if (!delta) + return 0; + } + + /* try the change */ + ret = xfs_icsb_add(mp, XFS_ICSB_FDBLOCKS, delta, + XFS_ALLOC_SET_ASIDE(mp)); + if (likely(ret >= 0)) + return 0; + + /* ENOSPC */ + ASSERT(ret == -ENOSPC); + ASSERT(delta < 0); + + if (!rsvd) + return XFS_ERROR(ENOSPC); + + spin_lock(&mp->m_sb_lock); + lcounter = (int64_t)mp->m_resblks_avail + delta; + if (lcounter >= 0) { + mp->m_resblks_avail = lcounter; + spin_unlock(&mp->m_sb_lock); + return 0; + } + spin_unlock(&mp->m_sb_lock); + printk_once(KERN_WARNING + "Filesystem \"%s\": reserve blocks depleted! " + "Consider increasing reserve pool size.", + mp->m_fsname); + return XFS_ERROR(ENOSPC); + default: + ASSERT(0); + return XFS_ERROR(EINVAL); + } + return 0; +} + /* * xfs_getsb() is called to obtain the buffer for the superblock. * The buffer is returned locked and read in from disk. @@ -2000,572 +2175,3 @@ xfs_dev_is_read_only( } return 0; } - -#ifdef HAVE_PERCPU_SB -/* - * Per-cpu incore superblock counters - * - * Simple concept, difficult implementation - * - * Basically, replace the incore superblock counters with a distributed per cpu - * counter for contended fields (e.g. free block count). - * - * Difficulties arise in that the incore sb is used for ENOSPC checking, and - * hence needs to be accurately read when we are running low on space. 
Hence - * there is a method to enable and disable the per-cpu counters based on how - * much "stuff" is available in them. - * - * Basically, a counter is enabled if there is enough free resource to justify - * running a per-cpu fast-path. If the per-cpu counter runs out (i.e. a local - * ENOSPC), then we disable the counters to synchronise all callers and - * re-distribute the available resources. - * - * If, once we redistributed the available resources, we still get a failure, - * we disable the per-cpu counter and go through the slow path. - * - * The slow path is the current xfs_mod_incore_sb() function. This means that - * when we disable a per-cpu counter, we need to drain its resources back to - * the global superblock. We do this after disabling the counter to prevent - * more threads from queueing up on the counter. - * - * Essentially, this means that we still need a lock in the fast path to enable - * synchronisation between the global counters and the per-cpu counters. This - * is not a problem because the lock will be local to a CPU almost all the time - * and have little contention except when we get to ENOSPC conditions. - * - * Basically, this lock becomes a barrier that enables us to lock out the fast - * path while we do things like enabling and disabling counters and - * synchronising the counters. - * - * Locking rules: - * - * 1. m_sb_lock before picking up per-cpu locks - * 2. per-cpu locks always picked up via for_each_online_cpu() order - * 3. accurate counter sync requires m_sb_lock + per cpu locks - * 4. modifying per-cpu counters requires holding per-cpu lock - * 5. modifying global counters requires holding m_sb_lock - * 6. enabling or disabling a counter requires holding the m_sb_lock - * and _none_ of the per-cpu locks. - * - * Disabled counters are only ever re-enabled by a balance operation - * that results in more free resources per CPU than a given threshold. - * To ensure counters don't remain disabled, they are rebalanced when - * the global resource goes above a higher threshold (i.e. some hysteresis - * is present to prevent thrashing). - */ - -#ifdef CONFIG_HOTPLUG_CPU -/* - * hot-plug CPU notifier support. - * - * We need a notifier per filesystem as we need to be able to identify - * the filesystem to balance the counters out. This is achieved by - * having a notifier block embedded in the xfs_mount_t and doing pointer - * magic to get the mount pointer from the notifier block address. - */ -STATIC int -xfs_icsb_cpu_notify( - struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - xfs_icsb_cnts_t *cntp; - xfs_mount_t *mp; - - mp = (xfs_mount_t *)container_of(nfb, xfs_mount_t, m_icsb_notifier); - cntp = (xfs_icsb_cnts_t *) - per_cpu_ptr(mp->m_sb_cnts, (unsigned long)hcpu); - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - /* Easy Case - initialize the area and locks, and - * then rebalance when online does everything else for us. */ - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - xfs_icsb_lock(mp); - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); - xfs_icsb_unlock(mp); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - /* Disable all the counters, then fold the dead cpu's - * count into the total on the global superblock and - * re-enable the counters. 
*/ - xfs_icsb_lock(mp); - spin_lock(&mp->m_sb_lock); - xfs_icsb_disable_counter(mp, XFS_SBS_ICOUNT); - xfs_icsb_disable_counter(mp, XFS_SBS_IFREE); - xfs_icsb_disable_counter(mp, XFS_SBS_FDBLOCKS); - - mp->m_sb.sb_icount += cntp->icsb_icount; - mp->m_sb.sb_ifree += cntp->icsb_ifree; - mp->m_sb.sb_fdblocks += cntp->icsb_fdblocks; - - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - - xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0); - spin_unlock(&mp->m_sb_lock); - xfs_icsb_unlock(mp); - break; - } - - return NOTIFY_OK; -} -#endif /* CONFIG_HOTPLUG_CPU */ - -int -xfs_icsb_init_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - mp->m_sb_cnts = alloc_percpu(xfs_icsb_cnts_t); - if (mp->m_sb_cnts == NULL) - return -ENOMEM; - -#ifdef CONFIG_HOTPLUG_CPU - mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; - mp->m_icsb_notifier.priority = 0; - register_hotcpu_notifier(&mp->m_icsb_notifier); -#endif /* CONFIG_HOTPLUG_CPU */ - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - } - - mutex_init(&mp->m_icsb_mutex); - - /* - * start with all counters disabled so that the - * initial balance kicks us off correctly - */ - mp->m_icsb_counters = -1; - return 0; -} - -void -xfs_icsb_reinit_counters( - xfs_mount_t *mp) -{ - xfs_icsb_lock(mp); - /* - * start with all counters disabled so that the - * initial balance kicks us off correctly - */ - mp->m_icsb_counters = -1; - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); - xfs_icsb_unlock(mp); -} - -void -xfs_icsb_destroy_counters( - xfs_mount_t *mp) -{ - if (mp->m_sb_cnts) { - unregister_hotcpu_notifier(&mp->m_icsb_notifier); - free_percpu(mp->m_sb_cnts); - } - mutex_destroy(&mp->m_icsb_mutex); -} - -STATIC void -xfs_icsb_lock_cntr( - xfs_icsb_cnts_t *icsbp) -{ - while (test_and_set_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags)) { - ndelay(1000); - } -} - -STATIC void -xfs_icsb_unlock_cntr( - xfs_icsb_cnts_t *icsbp) -{ - clear_bit(XFS_ICSB_FLAG_LOCK, &icsbp->icsb_flags); -} - - -STATIC void -xfs_icsb_lock_all_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - xfs_icsb_lock_cntr(cntp); - } -} - -STATIC void -xfs_icsb_unlock_all_counters( - xfs_mount_t *mp) -{ - xfs_icsb_cnts_t *cntp; - int i; - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - xfs_icsb_unlock_cntr(cntp); - } -} - -STATIC void -xfs_icsb_count( - xfs_mount_t *mp, - xfs_icsb_cnts_t *cnt, - int flags) -{ - xfs_icsb_cnts_t *cntp; - int i; - - memset(cnt, 0, sizeof(xfs_icsb_cnts_t)); - - if (!(flags & XFS_ICSB_LAZY_COUNT)) - xfs_icsb_lock_all_counters(mp); - - for_each_online_cpu(i) { - cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); - cnt->icsb_icount += cntp->icsb_icount; - cnt->icsb_ifree += cntp->icsb_ifree; - cnt->icsb_fdblocks += cntp->icsb_fdblocks; - } - - if (!(flags & XFS_ICSB_LAZY_COUNT)) - xfs_icsb_unlock_all_counters(mp); -} - -STATIC int -xfs_icsb_counter_disabled( - xfs_mount_t *mp, - xfs_sb_field_t field) -{ - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - return test_bit(field, &mp->m_icsb_counters); -} - -STATIC void -xfs_icsb_disable_counter( - xfs_mount_t *mp, - xfs_sb_field_t field) -{ - 
xfs_icsb_cnts_t cnt; - - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - - /* - * If we are already disabled, then there is nothing to do - * here. We check before locking all the counters to avoid - * the expensive lock operation when being called in the - * slow path and the counter is already disabled. This is - * safe because the only time we set or clear this state is under - * the m_icsb_mutex. - */ - if (xfs_icsb_counter_disabled(mp, field)) - return; - - xfs_icsb_lock_all_counters(mp); - if (!test_and_set_bit(field, &mp->m_icsb_counters)) { - /* drain back to superblock */ - - xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT); - switch(field) { - case XFS_SBS_ICOUNT: - mp->m_sb.sb_icount = cnt.icsb_icount; - break; - case XFS_SBS_IFREE: - mp->m_sb.sb_ifree = cnt.icsb_ifree; - break; - case XFS_SBS_FDBLOCKS: - mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; - break; - default: - BUG(); - } - } - - xfs_icsb_unlock_all_counters(mp); -} - -STATIC void -xfs_icsb_enable_counter( - xfs_mount_t *mp, - xfs_sb_field_t field, - uint64_t count, - uint64_t resid) -{ - xfs_icsb_cnts_t *cntp; - int i; - - ASSERT((field >= XFS_SBS_ICOUNT) && (field <= XFS_SBS_FDBLOCKS)); - - xfs_icsb_lock_all_counters(mp); - for_each_online_cpu(i) { - cntp = per_cpu_ptr(mp->m_sb_cnts, i); - switch (field) { - case XFS_SBS_ICOUNT: - cntp->icsb_icount = count + resid; - break; - case XFS_SBS_IFREE: - cntp->icsb_ifree = count + resid; - break; - case XFS_SBS_FDBLOCKS: - cntp->icsb_fdblocks = count + resid; - break; - default: - BUG(); - break; - } - resid = 0; - } - clear_bit(field, &mp->m_icsb_counters); - xfs_icsb_unlock_all_counters(mp); -} - -void -xfs_icsb_sync_counters_locked( - xfs_mount_t *mp, - int flags) -{ - xfs_icsb_cnts_t cnt; - - xfs_icsb_count(mp, &cnt, flags); - - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT)) - mp->m_sb.sb_icount = cnt.icsb_icount; - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE)) - mp->m_sb.sb_ifree = cnt.icsb_ifree; - if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS)) - mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; -} - -/* - * Accurate update of per-cpu counters to incore superblock - */ -void -xfs_icsb_sync_counters( - xfs_mount_t *mp, - int flags) -{ - spin_lock(&mp->m_sb_lock); - xfs_icsb_sync_counters_locked(mp, flags); - spin_unlock(&mp->m_sb_lock); -} - -/* - * Balance and enable/disable counters as necessary. - * - * Thresholds for re-enabling counters are somewhat magic. inode counts are - * chosen to be the same number as single on disk allocation chunk per CPU, and - * free blocks is something far enough zero that we aren't going thrash when we - * get near ENOSPC. We also need to supply a minimum we require per cpu to - * prevent looping endlessly when xfs_alloc_space asks for more than will - * be distributed to a single CPU but each CPU has enough blocks to be - * reenabled. - * - * Note that we can be called when counters are already disabled. - * xfs_icsb_disable_counter() optimises the counter locking in this case to - * prevent locking every per-cpu counter needlessly. 
- */ - -#define XFS_ICSB_INO_CNTR_REENABLE (uint64_t)64 -#define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \ - (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp)) -STATIC void -xfs_icsb_balance_counter_locked( - xfs_mount_t *mp, - xfs_sb_field_t field, - int min_per_cpu) -{ - uint64_t count, resid; - int weight = num_online_cpus(); - uint64_t min = (uint64_t)min_per_cpu; - - /* disable counter and sync counter */ - xfs_icsb_disable_counter(mp, field); - - /* update counters - first CPU gets residual*/ - switch (field) { - case XFS_SBS_ICOUNT: - count = mp->m_sb.sb_icount; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) - return; - break; - case XFS_SBS_IFREE: - count = mp->m_sb.sb_ifree; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) - return; - break; - case XFS_SBS_FDBLOCKS: - count = mp->m_sb.sb_fdblocks; - resid = do_div(count, weight); - if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp))) - return; - break; - default: - BUG(); - count = resid = 0; /* quiet, gcc */ - break; - } - - xfs_icsb_enable_counter(mp, field, count, resid); -} - -STATIC void -xfs_icsb_balance_counter( - xfs_mount_t *mp, - xfs_sb_field_t fields, - int min_per_cpu) -{ - spin_lock(&mp->m_sb_lock); - xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu); - spin_unlock(&mp->m_sb_lock); -} - -int -xfs_icsb_modify_counters( - xfs_mount_t *mp, - xfs_sb_field_t field, - int64_t delta, - int rsvd) -{ - xfs_icsb_cnts_t *icsbp; - long long lcounter; /* long counter for 64 bit fields */ - int ret = 0; - - might_sleep(); -again: - preempt_disable(); - icsbp = this_cpu_ptr(mp->m_sb_cnts); - - /* - * if the counter is disabled, go to slow path - */ - if (unlikely(xfs_icsb_counter_disabled(mp, field))) - goto slow_path; - xfs_icsb_lock_cntr(icsbp); - if (unlikely(xfs_icsb_counter_disabled(mp, field))) { - xfs_icsb_unlock_cntr(icsbp); - goto slow_path; - } - - switch (field) { - case XFS_SBS_ICOUNT: - lcounter = icsbp->icsb_icount; - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_icount = lcounter; - break; - - case XFS_SBS_IFREE: - lcounter = icsbp->icsb_ifree; - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_ifree = lcounter; - break; - - case XFS_SBS_FDBLOCKS: - BUG_ON((mp->m_resblks - mp->m_resblks_avail) != 0); - - lcounter = icsbp->icsb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); - lcounter += delta; - if (unlikely(lcounter < 0)) - goto balance_counter; - icsbp->icsb_fdblocks = lcounter + XFS_ALLOC_SET_ASIDE(mp); - break; - default: - BUG(); - break; - } - xfs_icsb_unlock_cntr(icsbp); - preempt_enable(); - return 0; - -slow_path: - preempt_enable(); - - /* - * serialise with a mutex so we don't burn lots of cpu on - * the superblock lock. We still need to hold the superblock - * lock, however, when we modify the global structures. - */ - xfs_icsb_lock(mp); - - /* - * Now running atomically. - * - * If the counter is enabled, someone has beaten us to rebalancing. - * Drop the lock and try again in the fast path.... - */ - if (!(xfs_icsb_counter_disabled(mp, field))) { - xfs_icsb_unlock(mp); - goto again; - } - - /* - * The counter is currently disabled. Because we are - * running atomically here, we know a rebalance cannot - * be in progress. Hence we can go straight to operating - * on the global superblock. We do not call xfs_mod_incore_sb() - * here even though we need to get the m_sb_lock. Doing so - * will cause us to re-enter this function and deadlock. 
- * Hence we get the m_sb_lock ourselves and then call - * xfs_mod_incore_sb_unlocked() as the unlocked path operates - * directly on the global counters. - */ - spin_lock(&mp->m_sb_lock); - ret = xfs_mod_incore_sb_unlocked(mp, field, delta, rsvd); - spin_unlock(&mp->m_sb_lock); - - /* - * Now that we've modified the global superblock, we - * may be able to re-enable the distributed counters - * (e.g. lots of space just got freed). After that - * we are done. - */ - if (ret != ENOSPC) - xfs_icsb_balance_counter(mp, field, 0); - xfs_icsb_unlock(mp); - return ret; - -balance_counter: - xfs_icsb_unlock_cntr(icsbp); - preempt_enable(); - - /* - * We may have multiple threads here if multiple per-cpu - * counters run dry at the same time. This will mean we can - * do more balances than strictly necessary but it is not - * the common slowpath case. - */ - xfs_icsb_lock(mp); - - /* - * running atomically. - * - * This will leave the counter in the correct state for future - * accesses. After the rebalance, we simply try again and our retry - * will either succeed through the fast path or slow path without - * another balance operation being required. - */ - xfs_icsb_balance_counter(mp, field, delta); - xfs_icsb_unlock(mp); - goto again; -} - -#endif diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 5861b49..7efae1d 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -65,44 +65,19 @@ struct xfs_nameops; struct xfs_ail; struct xfs_quotainfo; -#ifdef HAVE_PERCPU_SB - /* - * Valid per-cpu incore superblock counters. Note that if you add new counters, - * you may need to define new counter disabled bit field descriptors as there - * are more possible fields in the superblock that can fit in a bitfield on a - * 32 bit platform. The XFS_SBS_* values for the current current counters just - * fit. + * Per-cpu incore superblock counters. 
*/ -typedef struct xfs_icsb_cnts { - uint64_t icsb_fdblocks; - uint64_t icsb_ifree; - uint64_t icsb_icount; - unsigned long icsb_flags; -} xfs_icsb_cnts_t; - -#define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */ +enum { + XFS_ICSB_FDBLOCKS = 0, + XFS_ICSB_IFREE, + XFS_ICSB_ICOUNT, + XFS_ICSB_MAX, +}; -#define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */ - -extern int xfs_icsb_init_counters(struct xfs_mount *); -extern void xfs_icsb_reinit_counters(struct xfs_mount *); -extern void xfs_icsb_destroy_counters(struct xfs_mount *); -extern void xfs_icsb_sync_counters(struct xfs_mount *, int); -extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); extern int xfs_icsb_modify_counters(struct xfs_mount *, xfs_sb_field_t, int64_t, int); -#else -#define xfs_icsb_init_counters(mp) (0) -#define xfs_icsb_destroy_counters(mp) do { } while (0) -#define xfs_icsb_reinit_counters(mp) do { } while (0) -#define xfs_icsb_sync_counters(mp, flags) do { } while (0) -#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) -#define xfs_icsb_modify_counters(mp, field, delta, rsvd) \ - xfs_mod_incore_sb(mp, field, delta, rsvd) -#endif - typedef struct xfs_mount { struct super_block *m_super; xfs_tid_t m_tid; /* next unused tid for fs */ @@ -186,12 +161,6 @@ typedef struct xfs_mount { struct xfs_chash *m_chash; /* fs private inode per-cluster * hash table */ atomic_t m_active_trans; /* number trans frozen */ -#ifdef HAVE_PERCPU_SB - xfs_icsb_cnts_t __percpu *m_sb_cnts; /* per-cpu superblock counters */ - unsigned long m_icsb_counters; /* disabled per-cpu counters */ - struct notifier_block m_icsb_notifier; /* hotplug cpu notifier */ - struct mutex m_icsb_mutex; /* balancer sync lock */ -#endif struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ struct task_struct *m_sync_task; /* generalised sync thread */ xfs_sync_work_t m_sync_work; /* work item for VFS_SYNC */ @@ -202,6 +171,7 @@ typedef struct xfs_mount { __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ struct shrinker m_inode_shrink; /* inode reclaim shrinker */ + struct percpu_counter m_icsb[XFS_ICSB_MAX]; } xfs_mount_t; /* @@ -333,26 +303,6 @@ struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *mp, xfs_agnumber_t agno, void xfs_perag_put(struct xfs_perag *pag); /* - * Per-cpu superblock locking functions - */ -#ifdef HAVE_PERCPU_SB -static inline void -xfs_icsb_lock(xfs_mount_t *mp) -{ - mutex_lock(&mp->m_icsb_mutex); -} - -static inline void -xfs_icsb_unlock(xfs_mount_t *mp) -{ - mutex_unlock(&mp->m_icsb_mutex); -} -#else -#define xfs_icsb_lock(mp) -#define xfs_icsb_unlock(mp) -#endif - -/* * This structure is for use by the xfs_mod_incore_sb_batch() routine. 
 * xfs_growfs can specify a few fields which are more than int limit */ @@ -379,6 +329,20 @@ extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t); extern int xfs_dev_is_read_only(struct xfs_mount *, char *); +extern int xfs_icsb_init_counters(struct xfs_mount *); +extern void xfs_icsb_reinit_counters(struct xfs_mount *); +extern void xfs_icsb_destroy_counters(struct xfs_mount *); +extern void xfs_icsb_sync_counters_locked(struct xfs_mount *); + +static inline void +xfs_icsb_sync_counters( + struct xfs_mount *mp) +{ + spin_lock(&mp->m_sb_lock); + xfs_icsb_sync_counters_locked(mp); + spin_unlock(&mp->m_sb_lock); +} + #endif /* __KERNEL__ */ extern void xfs_mod_sb(struct xfs_trans *, __int64_t); diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 46f6ba5..32014a4 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -41,6 +41,8 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); s64 __percpu_counter_sum(struct percpu_counter *fbc); int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs); +int percpu_counter_add_unless_lt(struct percpu_counter *fbc, s64 amount, + s64 threshold); static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { @@ -153,6 +155,20 @@ static inline int percpu_counter_initialized(struct percpu_counter *fbc) return 1; } +static inline int percpu_counter_test_and_add_delta(struct percpu_counter *fbc, s64 delta) +{ + s64 count; + + preempt_disable(); + count = fbc->count + delta; + if (count >= 0) + fbc->count = count; + preempt_enable(); + + return count < 0 ? -1 : (count ? 1 : 0); +} + + #endif /* CONFIG_SMP */ static inline void percpu_counter_inc(struct percpu_counter *fbc) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 604678d..13c4ff3 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -213,6 +213,85 @@ int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs) } EXPORT_SYMBOL(percpu_counter_compare); +/** + * + * percpu_counter_add_unless_lt - add to a counter avoiding underruns + * @fbc: counter + * @amount: amount to add + * @threshold: underrun threshold + * + * Add @amount to @fbc if and only if the result of the addition is greater + * than or equal to @threshold. Return 1 if greater and added, 0 if equal and + * added, and -1 if an underrun would have occurred. + * + * This is useful for operations that must accurately and atomically add a + * delta to a counter only if the result is not less than a given threshold + * (e.g. for freespace accounting with ENOSPC checking in filesystems). + */ +int percpu_counter_add_unless_lt(struct percpu_counter *fbc, s64 amount, s64 +threshold) +{ + s64 count; + s64 error = 2 * percpu_counter_batch * num_online_cpus(); + int cpu; + int ret = -1; + + preempt_disable(); + + /* Check to see if rough count will be sufficient for comparison */ + count = percpu_counter_read(fbc); + if (count + amount < threshold - error) + goto out; + + /* + * If the counter is over the threshold and the change is less than the + * batch size, we might be able to avoid locking. + */ + if (count > threshold + error && abs(amount) < percpu_counter_batch) { + __percpu_counter_add(fbc, amount, percpu_counter_batch); + ret = 1; + goto out; + } + + /* + * If the result is over the error threshold, we can just add it + * into the global counter ignoring what is in the per-cpu counters + * as they will not change the result of the calculation.
+ */ + spin_lock(&fbc->lock); + if (fbc->count + amount > threshold + error) { + fbc->count += amount; + ret = 1; + goto out_unlock; + } + + /* + * Result is within the error margin. Run an open-coded sum of the + * per-cpu counters to get the exact value at this point in time, + * and if the result would be at or above the threshold, add the + * amount to the global counter. + */ + count = fbc->count; + for_each_online_cpu(cpu) { + s32 *pcount = per_cpu_ptr(fbc->counters, cpu); + count += *pcount; + } + WARN_ON(count < threshold); + + if (count + amount >= threshold) { + ret = 0; + if (count + amount > threshold) + ret = 1; + fbc->count += amount; + } +out_unlock: + spin_unlock(&fbc->lock); +out: + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(percpu_counter_add_unless_lt); + static int __init percpu_counter_startup(void) { compute_batch_value(); -- 1.7.2.3 _______________________________________________ xfs mailing list xfs@xxxxxxxxxxx http://oss.sgi.com/mailman/listinfo/xfs
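
As a usage sketch only -- not part of the patch -- this is how a filesystem would drive the proposed percpu_counter_add_unless_lt() primitive for ENOSPC-safe space accounting; it mirrors the xfs_icsb_add() wrapper added above. The my_fs_* names and the reserved-blocks threshold are hypothetical.

/*
 * Minimal usage sketch of the proposed API. Assumes my_fs_fdblocks has
 * already been set up with percpu_counter_init(); all my_fs_* names are
 * illustrative only.
 */
#include <linux/errno.h>
#include <linux/percpu_counter.h>

static struct percpu_counter my_fs_fdblocks;	/* free data blocks */

/*
 * Consume (delta < 0) or give back (delta > 0) free blocks. Fails with
 * -ENOSPC if the update would take the counter below @reserved.
 */
static int my_fs_mod_fdblocks(s64 delta, s64 reserved)
{
	int ret;

	/*
	 * percpu_counter_add_unless_lt() returns 1 if the result stays
	 * above @reserved, 0 if it lands exactly on it (in both cases the
	 * delta was applied), and -1 if applying the delta would have
	 * underrun the threshold, in which case nothing was changed.
	 */
	ret = percpu_counter_add_unless_lt(&my_fs_fdblocks, delta, reserved);
	if (ret < 0)
		return -ENOSPC;
	return 0;
}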