On Fri, Dec 16, 2011 at 12:43:52PM -0600, Nate Custer wrote:
> 
> On Dec 15, 2011, at 1:47 PM, Vivek Goyal wrote:
> > Ok, I continued to develop on the patch which tries to allocate per cpu
> > stats from worker thread context. Here is the patch.
> > 
> > Can the reporter please try out the patch and see if it helps. I am not
> > sure if deadlock was because of mutex issue or not, but it should help
> > get rid of lockdep warning.
> > 
> > This patch is on top of for-3.3/core branch of jens's linux-block tree.
> > If it does not apply on your kernel version, do let me know the version
> > you are testing with and I will generate another version of patch.
> > 
> > If testing results are good, I will break down the patch in small pieces
> > and post as a series separately.
> > 
> > Thanks
> > Vivek
> 
> Running on a fedora-16 box with the patch applied to the linux-block tree
> I still had deadlocks. In fact it seemed to happen much faster and with
> lighter workloads.
> 
> I was able to get netconsole set up and a full stacktrace is posted here:
> 
> http://pastebin.com/9Rq68exU

Thanks for testing it, Nate. I did some debugging and found that the patch
was doing a double free on the per cpu pointer, hence the crash you were
running into. I could reproduce the problem on my box; it is just a matter
of doing an rmdir on the blkio cgroup.

I had understood the cmpxchg() semantics wrong. That is fixed now and I see
no crashes on cgroup directory removal. Can you please give this version a
try?

Thanks
Vivek

Allocate per cpu data from a worker thread context to avoid the possibility
of a deadlock under memory pressure. The only side effect of the delayed
allocation is that we lose the blkio cgroup stats for that group for a short
duration.

This patch is generated on top of the "for-3.3/core" branch of the
linux-block tree.

-v2: Fixed the issue of double free on the percpu pointer.
---
 block/blk-cgroup.c   |   31 ++++++++++++-
 block/blk-cgroup.h   |    9 +++
 block/blk-throttle.c |  116 ++++++++++++++++++++++++++++++++-----------------
 block/cfq-iosched.c  |  119 ++++++++++++++++++++++++++++++---------------------
 4 files changed, 180 insertions(+), 95 deletions(-)

Index: block/blk-throttle.c
===================================================================
--- block/blk-throttle.c.orig	2011-12-17 01:49:34.000000000 -0500
+++ block/blk-throttle.c	2011-12-17 02:21:50.000000000 -0500
@@ -81,6 +81,7 @@ struct throtl_grp {
 	int limits_changed;
 
 	struct rcu_head rcu_head;
+	struct work_struct stat_alloc_work;
 };
 
 struct throtl_data
@@ -159,6 +160,13 @@ static void throtl_free_tg(struct rcu_he
 	struct throtl_grp *tg;
 
 	tg = container_of(head, struct throtl_grp, rcu_head);
+
+	/*
+	 * Will delayed allocation be visible here for sure? I am relying
+	 * on the fact that after blkg.stats_cpu assignment, we drop
+	 * reference to group using atomic_dec() which should imply
+	 * barrier
+	 */
 	free_percpu(tg->blkg.stats_cpu);
 	kfree(tg);
 }
@@ -181,6 +189,48 @@ static void throtl_put_tg(struct throtl_
 	call_rcu(&tg->rcu_head, throtl_free_tg);
 }
 
+static inline void throtl_check_schedule_pcpu_stat_alloc(struct throtl_grp *tg) {
+	if (tg->blkg.stats_cpu != NULL)
+		return;
+	/*
+	 * Schedule the group per cpu stat allocation through worker
+	 * thread
+	 */
+	throtl_ref_get_tg(tg);
+	if (!schedule_work(&tg->stat_alloc_work)) {
+		/* Work is already scheduled by somebody */
+		throtl_put_tg(tg);
+	}
+}
+
+/* No locks taken. A reference to throtl_grp taken before invocation */
+static void tg_stat_alloc_fn(struct work_struct *work)
+{
+	struct throtl_grp *tg = container_of(work, struct throtl_grp,
+						stat_alloc_work);
+	void *stat_ptr = NULL;
+	unsigned long ret;
+
+	stat_ptr = blkio_alloc_blkg_stats_pcpu();
+
+	if (stat_ptr == NULL) {
+		/* If memory allocation failed, try again */
+		throtl_check_schedule_pcpu_stat_alloc(tg);
+		throtl_put_tg(tg);
+		return;
+	}
+
+	ret = blkio_cmpxchg_blkg_stats(&tg->blkg, 0,
+					(unsigned long)stat_ptr);
+
+	if (ret != 0) {
+		/* Somebody else got to it first */
+		free_percpu(stat_ptr);
+	}
+
+	throtl_put_tg(tg);
+}
+
 static void throtl_init_group(struct throtl_grp *tg)
 {
 	INIT_HLIST_NODE(&tg->tg_node);
@@ -188,6 +238,7 @@ static void throtl_init_group(struct thr
 	bio_list_init(&tg->bio_lists[0]);
 	bio_list_init(&tg->bio_lists[1]);
 	tg->limits_changed = false;
+	INIT_WORK(&tg->stat_alloc_work, tg_stat_alloc_fn);
 
 	/* Practically unlimited BW */
 	tg->bps[0] = tg->bps[1] = -1;
@@ -263,8 +314,25 @@ static void throtl_init_add_tg_lists(str
 	throtl_add_group_to_td_list(td, tg);
 }
 
-/* Should be called without queue lock and outside of rcu period */
-static struct throtl_grp *throtl_alloc_tg(struct throtl_data *td)
+/* Allocates per cpu stats asynchronously with the help of worker thread */
+static struct throtl_grp *throtl_alloc_tg_async(struct throtl_data *td)
+{
+	struct throtl_grp *tg = NULL;
+
+	tg = kzalloc_node(sizeof(*tg), GFP_ATOMIC, td->queue->node);
+	if (!tg)
+		return NULL;
+
+	throtl_init_group(tg);
+	throtl_check_schedule_pcpu_stat_alloc(tg);
+	return tg;
+}
+
+/*
+ * Should be called without queue lock and outside of rcu period as per
+ * cpu alloc will block
+ */
+static struct throtl_grp *throtl_alloc_tg_sync(struct throtl_data *td)
 {
 	struct throtl_grp *tg = NULL;
 	int ret;
@@ -273,7 +341,7 @@ static struct throtl_grp *throtl_alloc_t
 	if (!tg)
 		return NULL;
 
-	ret = blkio_alloc_blkg_stats(&tg->blkg);
+	ret = blkio_alloc_set_blkg_stats(&tg->blkg);
 
 	if (ret) {
 		kfree(tg);
@@ -305,7 +373,7 @@ throtl_grp *throtl_find_tg(struct throtl
 
 static struct throtl_grp * throtl_get_tg(struct throtl_data *td)
 {
-	struct throtl_grp *tg = NULL, *__tg = NULL;
+	struct throtl_grp *tg = NULL;
 	struct blkio_cgroup *blkcg;
 	struct request_queue *q = td->queue;
 
@@ -321,46 +389,12 @@ static struct throtl_grp * throtl_get_tg
 		return tg;
 	}
 
-	/*
-	 * Need to allocate a group. Allocation of group also needs allocation
-	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
-	 * we need to drop rcu lock and queue_lock before we call alloc.
-	 */
-	rcu_read_unlock();
-	spin_unlock_irq(q->queue_lock);
-
-	tg = throtl_alloc_tg(td);
-
-	/* Group allocated and queue is still alive. take the lock */
-	spin_lock_irq(q->queue_lock);
-
-	/* Make sure @q is still alive */
-	if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
-		kfree(tg);
-		return NULL;
-	}
-
-	/*
-	 * Initialize the new group. After sleeping, read the blkcg again.
-	 */
-	rcu_read_lock();
-	blkcg = task_blkio_cgroup(current);
-
-	/*
-	 * If some other thread already allocated the group while we were
-	 * not holding queue lock, free up the group
-	 */
-	__tg = throtl_find_tg(td, blkcg);
-
-	if (__tg) {
-		kfree(tg);
-		rcu_read_unlock();
-		return __tg;
-	}
+	tg = throtl_alloc_tg_async(td);
 
 	/* Group allocation failed. Account the IO to root group */
 	if (!tg) {
 		tg = td->root_tg;
+		rcu_read_unlock();
 		return tg;
 	}
 
@@ -1254,7 +1288,7 @@ int blk_throtl_init(struct request_queue
 
 	/* alloc and Init root group. */
 	td->queue = q;
-	tg = throtl_alloc_tg(td);
+	tg = throtl_alloc_tg_sync(td);
 
 	if (!tg) {
 		kfree(td);

Index: block/blk-cgroup.c
===================================================================
--- block/blk-cgroup.c.orig	2011-12-17 01:49:34.000000000 -0500
+++ block/blk-cgroup.c	2011-12-17 01:49:35.000000000 -0500
@@ -400,6 +400,9 @@ void blkiocg_update_dispatch_stats(struc
 	struct blkio_group_stats_cpu *stats_cpu;
 	unsigned long flags;
 
+	if (blkg->stats_cpu == NULL)
+		return;
+
 	/*
 	 * Disabling interrupts to provide mutual exclusion between two
 	 * writes on same cpu. It probably is not needed for 64bit. Not
@@ -446,6 +449,9 @@ void blkiocg_update_io_merged_stats(stru
 	struct blkio_group_stats_cpu *stats_cpu;
 	unsigned long flags;
 
+	if (blkg->stats_cpu == NULL)
+		return;
+
 	/*
 	 * Disabling interrupts to provide mutual exclusion between two
 	 * writes on same cpu. It probably is not needed for 64bit. Not
@@ -465,9 +471,11 @@ EXPORT_SYMBOL_GPL(blkiocg_update_io_merg
 
 /*
  * This function allocates the per cpu stats for blkio_group. Should be called
- * from sleepable context as alloc_per_cpu() requires that.
+ * from sleepable context as alloc_per_cpu() requires that. percpu alloc does
+ * not take any flags and does GFP_KERNEL allocations. Don't use it from
+ * IO submission path which usually might require GFP_NOIO.
  */
-int blkio_alloc_blkg_stats(struct blkio_group *blkg)
+int blkio_alloc_set_blkg_stats(struct blkio_group *blkg)
 {
 	/* Allocate memory for per cpu stats */
 	blkg->stats_cpu = alloc_percpu(struct blkio_group_stats_cpu);
@@ -475,7 +483,14 @@ int blkio_alloc_blkg_stats(struct blkio_
 		return -ENOMEM;
 	return 0;
 }
-EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats);
+EXPORT_SYMBOL_GPL(blkio_alloc_set_blkg_stats);
+
+void* blkio_alloc_blkg_stats_pcpu(void)
+{
+	/* Allocate memory for per cpu stats */
+	return alloc_percpu(struct blkio_group_stats_cpu);
+}
+EXPORT_SYMBOL_GPL(blkio_alloc_blkg_stats_pcpu);
 
 void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 		struct blkio_group *blkg, void *key, dev_t dev,
@@ -551,6 +566,12 @@ static void blkio_reset_stats_cpu(struct
 {
 	struct blkio_group_stats_cpu *stats_cpu;
 	int i, j, k;
+
+	/* blkg->stats_cpu can have delayed allocation */
+
+	if (!blkg->stats_cpu)
+		return;
+
 	/*
 	 * Note: On 64 bit arch this should not be an issue. This has the
 	 * possibility of returning some inconsistent value on 32bit arch
@@ -673,6 +694,10 @@ static uint64_t blkio_read_stat_cpu(stru
 	struct blkio_group_stats_cpu *stats_cpu;
 	u64 val = 0, tval;
 
+	/* blkg->stats_cpu might not have been allocated yet */
+	if (blkg->stats_cpu == NULL)
+		return 0;
+
 	for_each_possible_cpu(cpu) {
 		unsigned int start;
 		stats_cpu = per_cpu_ptr(blkg->stats_cpu, cpu);

Index: block/blk-cgroup.h
===================================================================
--- block/blk-cgroup.h.orig	2011-12-17 01:49:34.000000000 -0500
+++ block/blk-cgroup.h	2011-12-17 01:49:35.000000000 -0500
@@ -310,7 +310,12 @@ extern struct blkio_cgroup *task_blkio_c
 extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg,
 	struct blkio_group *blkg, void *key, dev_t dev,
 	enum blkio_policy_id plid);
-extern int blkio_alloc_blkg_stats(struct blkio_group *blkg);
+extern int blkio_alloc_set_blkg_stats(struct blkio_group *blkg);
+extern void* blkio_alloc_blkg_stats_pcpu(void);
+
+#define blkio_cmpxchg_blkg_stats(blkg, oldval, newval) \
+	cmpxchg((unsigned long *)&(blkg)->stats_cpu, oldval, newval);
+
 extern int blkiocg_del_blkio_group(struct blkio_group *blkg);
 extern struct blkio_group *blkiocg_lookup_group(struct blkio_cgroup *blkcg,
 						void *key);
@@ -338,7 +343,7 @@ static inline void blkiocg_add_blkio_gro
 		struct blkio_group *blkg, void *key, dev_t dev,
 		enum blkio_policy_id plid) {}
 
-static inline int blkio_alloc_blkg_stats(struct blkio_group *blkg) { return 0; }
+static inline int blkio_alloc_set_blkg_stats(struct blkio_group *blkg) { return 0; }
 
 static inline int
 blkiocg_del_blkio_group(struct blkio_group *blkg) { return 0; }

Index: block/cfq-iosched.c
===================================================================
--- block/cfq-iosched.c.orig	2011-12-17 01:49:34.000000000 -0500
+++ block/cfq-iosched.c	2011-12-17 02:21:50.000000000 -0500
@@ -209,7 +209,12 @@ struct cfq_group {
 	struct blkio_group blkg;
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	struct hlist_node cfqd_node;
-	int ref;
+	/*
+	 * blkg per cpu stat allocation code will need to put reference
+	 * without having queue lock. Hence keep it atomic.
+	 */
+	atomic_t ref;
+	struct work_struct stat_alloc_work;
 #endif
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
@@ -1012,6 +1017,9 @@ static void cfq_group_served(struct cfq_
 }
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
+static void cfq_put_cfqg(struct cfq_group *cfqg);
+static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg);
+
 static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
 {
 	if (blkg)
@@ -1054,6 +1062,52 @@ static void cfq_init_add_cfqg_lists(stru
 	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
 }
 
+static inline void cfq_check_schedule_pcpu_stat_alloc(struct cfq_group *cfqg) {
+	if (cfqg->blkg.stats_cpu != NULL)
+		return;
+
+	/*
+	 * Schedule the group per cpu stat allocation through worker
+	 * thread
+	 */
+	cfq_ref_get_cfqg(cfqg);
+	if (!schedule_work(&cfqg->stat_alloc_work)) {
+		/* Work is already scheduled by somebody */
+		cfq_put_cfqg(cfqg);
+	}
+}
+
+/* No locks taken. A reference to cfq_group taken before invocation */
+static void cfqg_stat_alloc_fn(struct work_struct *work)
+{
+	struct cfq_group *cfqg = container_of(work, struct cfq_group,
+						stat_alloc_work);
+	void *stat_ptr = NULL;
+	unsigned long ret;
+
+	stat_ptr = blkio_alloc_blkg_stats_pcpu();
+
+	if (stat_ptr == NULL) {
+		/*
+		 * Memory allocation failed. Try again. Should an upper limit
+		 * be put on number of retries?
+		 */
+		cfq_check_schedule_pcpu_stat_alloc(cfqg);
+		cfq_put_cfqg(cfqg);
+		return;
+	}
+
+	ret = blkio_cmpxchg_blkg_stats(&cfqg->blkg, 0,
+					(unsigned long)stat_ptr);
+
+	if (ret != 0) {
+		/* Somebody else got to it first */
+		free_percpu(stat_ptr);
+	}
+
+	cfq_put_cfqg(cfqg);
+}
+
 /*
  * Should be called from sleepable context. No request queue lock as per
  * cpu stats are allocated dynamically and alloc_percpu needs to be called
  *
@@ -1062,7 +1116,7 @@ static void cfq_init_add_cfqg_lists(stru
 static struct cfq_group * cfq_alloc_cfqg(struct cfq_data *cfqd)
 {
 	struct cfq_group *cfqg = NULL;
-	int i, j, ret;
+	int i, j;
 	struct cfq_rb_root *st;
 
 	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
@@ -1072,6 +1126,7 @@ static struct cfq_group * cfq_alloc_cfqg
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
 	RB_CLEAR_NODE(&cfqg->rb_node);
+	INIT_WORK(&cfqg->stat_alloc_work, cfqg_stat_alloc_fn);
 
 	cfqg->ttime.last_end_request = jiffies;
 
@@ -1081,14 +1136,9 @@ static struct cfq_group * cfq_alloc_cfqg
 	 * elevator which will be dropped by either elevator exit
 	 * or cgroup deletion path depending on who is exiting first.
 	 */
-	cfqg->ref = 1;
-
-	ret = blkio_alloc_blkg_stats(&cfqg->blkg);
-	if (ret) {
-		kfree(cfqg);
-		return NULL;
-	}
+	atomic_set(&cfqg->ref, 1);
+	cfq_check_schedule_pcpu_stat_alloc(cfqg);
 
 	return cfqg;
 }
 
@@ -1124,8 +1174,7 @@ cfq_find_cfqg(struct cfq_data *cfqd, str
 static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd)
 {
 	struct blkio_cgroup *blkcg;
-	struct cfq_group *cfqg = NULL, *__cfqg = NULL;
-	struct request_queue *q = cfqd->queue;
+	struct cfq_group *cfqg = NULL;
 
 	rcu_read_lock();
 	blkcg = task_blkio_cgroup(current);
@@ -1135,41 +1184,13 @@ static struct cfq_group *cfq_get_cfqg(st
 		return cfqg;
 	}
 
-	/*
-	 * Need to allocate a group. Allocation of group also needs allocation
-	 * of per cpu stats which in-turn takes a mutex() and can block. Hence
-	 * we need to drop rcu lock and queue_lock before we call alloc.
-	 *
-	 * Not taking any queue reference here and assuming that queue is
-	 * around by the time we return. CFQ queue allocation code does
-	 * the same. It might be racy though.
-	 */
-
-	rcu_read_unlock();
-	spin_unlock_irq(q->queue_lock);
-
 	cfqg = cfq_alloc_cfqg(cfqd);
-
-	spin_lock_irq(q->queue_lock);
-
-	rcu_read_lock();
-	blkcg = task_blkio_cgroup(current);
-
-	/*
-	 * If some other thread already allocated the group while we were
-	 * not holding queue lock, free up the group
-	 */
-	__cfqg = cfq_find_cfqg(cfqd, blkcg);
-
-	if (__cfqg) {
-		kfree(cfqg);
+	if (!cfqg) {
+		cfqg = &cfqd->root_group;
 		rcu_read_unlock();
-		return __cfqg;
+		return cfqg;
 	}
 
-	if (!cfqg)
-		cfqg = &cfqd->root_group;
-
 	cfq_init_add_cfqg_lists(cfqd, cfqg, blkcg);
 	rcu_read_unlock();
 	return cfqg;
@@ -1177,7 +1198,7 @@ static struct cfq_group *cfq_get_cfqg(st
 
 static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
 {
-	cfqg->ref++;
+	atomic_inc(&cfqg->ref);
 	return cfqg;
 }
 
@@ -1189,7 +1210,7 @@ static void cfq_link_cfqq_cfqg(struct cf
 	cfqq->cfqg = cfqg;
 
 	/* cfqq reference on cfqg */
-	cfqq->cfqg->ref++;
+	atomic_inc(&cfqq->cfqg->ref);
 }
 
 static void cfq_put_cfqg(struct cfq_group *cfqg)
@@ -1197,9 +1218,8 @@ static void cfq_put_cfqg(struct cfq_grou
 	struct cfq_rb_root *st;
 	int i, j;
 
-	BUG_ON(cfqg->ref <= 0);
-	cfqg->ref--;
-	if (cfqg->ref)
+	BUG_ON(atomic_read(&cfqg->ref) <= 0);
+	if (!atomic_dec_and_test(&cfqg->ref))
 		return;
 	for_each_cfqg_st(cfqg, i, j, st)
 		BUG_ON(!RB_EMPTY_ROOT(&st->rb));
@@ -4025,6 +4045,7 @@ static void *cfq_init_queue(struct reque
 	cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
 
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
+	INIT_WORK(&cfqg->stat_alloc_work, cfqg_stat_alloc_fn);
 	/*
 	 * Set root group reference to 2. One reference will be dropped when
 	 * all groups on cfqd->cfqg_list are being deleted during queue exit.
@@ -4032,9 +4053,9 @@ static void *cfq_init_queue(struct reque
 	 * group as it is statically allocated and gets destroyed when
 	 * throtl_data goes away.
	 */
-	cfqg->ref = 2;
+	atomic_set(&cfqg->ref, 2);
 
-	if (blkio_alloc_blkg_stats(&cfqg->blkg)) {
+	if (blkio_alloc_set_blkg_stats(&cfqg->blkg)) {
 		kfree(cfqg);
 		kfree(cfqd);
 		return NULL;
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html