On 02/25/2016 02:17 AM, Shaohua Li wrote:
> On Thu, Feb 25, 2016 at 11:31:04AM +1100, Neil Brown wrote:
>> On Thu, Feb 25 2016, Shaohua Li wrote:
>>
>>>
>>> As for the bug, write requests run in raid5d, mddev_suspend() waits
>>> for all IO, which waits for the write requests. So this is a clear
>>> deadlock. I think we should delete the check_reshape() in
>>> md_check_recovery(). If we change layout/disks/chunk_size,
>>> check_reshape() is already called. If we start an array, .run()
>>> already handles the new layout. There is no point in
>>> md_check_recovery() calling check_reshape() again.
>>
>> Are you sure?
>> Did you look at the commit which added that code?
>> commit b4c4c7b8095298ff4ce20b40bf180ada070812d0
>>
>> When there is an IO error, reshape (or resync or recovery) will abort
>> and then possibly be automatically restarted.
>
> thanks for pointing this out.
>
>> Without the check here a reshape might be attempted on an array which
>> has failed. Not sure if that would be harmful, but it would certainly
>> be pointless.
>>
>> But you are right that this is causing the problem.
>> Maybe we should keep track of the size of the 'scribble' arrays and
>> only call resize_chunks if the size needs to change? Similar to what
>> resize_stripes does.
>
> yep, this was my first solution, but then I thought check_reshape() was
> useless here; apparently I missed the restart case. I'll go this way.

My idea was to replace mddev_suspend()/mddev_resume() in resize_chunks()
with an rw lock that would prevent collisions with raid_run_ops(), since
scribble is used only there. But if the parity operations are executed
asynchronously, this would also need to wait until all the submitted
operations have completed. Seems a bit overkill, but I came up with this:

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a086014..3b7bbec 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -55,6 +55,7 @@
 #include <linux/ratelimit.h>
 #include <linux/nodemask.h>
 #include <linux/flex_array.h>
+#include <linux/delay.h>
 #include <trace/events/block.h>
 
 #include "md.h"
@@ -1267,6 +1268,8 @@ static void ops_complete_compute(void *stripe_head_ref)
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	atomic_dec(&sh->raid_conf->scribble_count);
+
 	/* mark the computed target(s) as uptodate */
 	mark_target_uptodate(sh, sh->ops.target);
 	mark_target_uptodate(sh, sh->ops.target2);
@@ -1314,6 +1317,9 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
 	pr_debug("%s: stripe %llu block: %d\n",
 		__func__, (unsigned long long)sh->sector, target);
+
+	atomic_inc(&sh->raid_conf->scribble_count);
+
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 
 	for (i = disks; i--; )
@@ -1399,6 +1405,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
 	pr_debug("%s: stripe %llu block: %d\n",
 		__func__, (unsigned long long)sh->sector, target);
 
+	atomic_inc(&sh->raid_conf->scribble_count);
+
 	tgt = &sh->dev[target];
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 	dest = tgt->page;
@@ -1449,6 +1457,9 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
 	BUG_ON(sh->batch_head);
 	pr_debug("%s: stripe %llu block1: %d block2: %d\n",
 		 __func__, (unsigned long long)sh->sector, target, target2);
+
+	atomic_inc(&sh->raid_conf->scribble_count);
+
 	BUG_ON(target < 0 || target2 < 0);
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
 	BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
@@ -1545,6 +1556,8 @@ static void ops_complete_prexor(void *stripe_head_ref)
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
+
+	atomic_dec(&sh->raid_conf->scribble_count);
 }
 
 static struct dma_async_tx_descriptor *
@@ -1563,6 +1576,8 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu,
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	atomic_inc(&sh->raid_conf->scribble_count);
+
 	for (i = disks; i--; ) {
 		struct r5dev *dev = &sh->dev[i];
 		/* Only process blocks that are known to be uptodate */
@@ -1588,6 +1603,8 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu,
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	atomic_inc(&sh->raid_conf->scribble_count);
+
 	count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_WANT_DRAIN);
 
 	init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx,
@@ -1672,6 +1689,8 @@ static void ops_complete_reconstruct(void *stripe_head_ref)
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	atomic_dec(&sh->raid_conf->scribble_count);
+
 	for (i = disks; i--; ) {
 		fua |= test_bit(R5_WantFUA, &sh->dev[i].flags);
 		sync |= test_bit(R5_SyncIO, &sh->dev[i].flags);
@@ -1722,6 +1741,8 @@ ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	atomic_inc(&sh->raid_conf->scribble_count);
+
 	for (i = 0; i < sh->disks; i++) {
 		if (pd_idx == i)
 			continue;
@@ -1804,6 +1825,8 @@ ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	atomic_inc(&sh->raid_conf->scribble_count);
+
 	for (i = 0; i < sh->disks; i++) {
 		if (sh->pd_idx == i || sh->qd_idx == i)
 			continue;
@@ -1857,6 +1880,8 @@ static void ops_complete_check(void *stripe_head_ref)
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	atomic_dec(&sh->raid_conf->scribble_count);
+
 	sh->check_state = check_state_check_result;
 	set_bit(STRIPE_HANDLE, &sh->state);
 	raid5_release_stripe(sh);
@@ -1877,6 +1902,8 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
 	pr_debug("%s: stripe %llu\n", __func__,
 		(unsigned long long)sh->sector);
 
+	atomic_inc(&sh->raid_conf->scribble_count);
+
 	BUG_ON(sh->batch_head);
 	count = 0;
 	xor_dest = sh->dev[pd_idx].page;
@@ -1906,6 +1933,8 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu
 	pr_debug("%s: stripe %llu checkp: %d\n", __func__,
 		(unsigned long long)sh->sector, checkp);
 
+	atomic_inc(&sh->raid_conf->scribble_count);
+
 	BUG_ON(sh->batch_head);
 	count = set_syndrome_sources(srcs, sh, SYNDROME_SRC_ALL);
 	if (!checkp)
@@ -1927,6 +1956,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 	struct raid5_percpu *percpu;
 	unsigned long cpu;
 
+	down_read(&conf->scribble_lock);
 	cpu = get_cpu();
 	percpu = per_cpu_ptr(conf->percpu, cpu);
 	if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
@@ -1985,6 +2015,7 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
 		wake_up(&sh->raid_conf->wait_for_overlap);
 	}
 	put_cpu();
+	up_read(&conf->scribble_lock);
 }
 
 static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp)
@@ -2089,7 +2120,10 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
 	unsigned long cpu;
 	int err = 0;
 
-	mddev_suspend(conf->mddev);
+	down_write(&conf->scribble_lock);
+	/* wait for async operations using scribble to complete */
+	while (atomic_read(&conf->scribble_count))
+		udelay(10);
 	get_online_cpus();
 	for_each_present_cpu(cpu) {
 		struct raid5_percpu *percpu;
@@ -2109,7 +2143,8 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors)
 		}
 	}
 	put_online_cpus();
-	mddev_resume(conf->mddev);
+	up_write(&conf->scribble_lock);
+
 	return err;
 }
 
@@ -6501,6 +6536,8 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	spin_lock_init(&conf->device_lock);
 	seqcount_init(&conf->gen_lock);
 	mutex_init(&conf->cache_size_mutex);
+	init_rwsem(&conf->scribble_lock);
+	atomic_set(&conf->scribble_count, 0);
 	init_waitqueue_head(&conf->wait_for_quiescent);
 	for (i = 0; i < NR_STRIPE_HASH_LOCKS; i++) {
 		init_waitqueue_head(&conf->wait_for_stripe[i]);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index a415e1c..8361156 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -3,6 +3,7 @@
 
 #include <linux/raid/xor.h>
 #include <linux/dmaengine.h>
+#include <linux/rwsem.h>
 
 /*
  *
@@ -494,6 +495,9 @@ struct r5conf {
 	struct kmem_cache	*slab_cache; /* for allocating stripes */
 	struct mutex		cache_size_mutex; /* Protect changes to cache size */
+	struct rw_semaphore	scribble_lock;
+	atomic_t		scribble_count;
+
 	int			seq_flush, seq_write;
 	int			quiesce;
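
By the way, if the udelay() polling loop above looks too ugly, the drain
could probably be done with a wait queue instead, so that resize_chunks()
sleeps rather than spins. A rough sketch, not part of the patch (the
scribble_wait field and the scribble_op_done() helper are made up for
illustration; the field would need an init_waitqueue_head() in
setup_conf(), and the helper would replace the bare atomic_dec() calls in
the ops_complete_* callbacks):

	/* in struct r5conf, next to scribble_count: */
	wait_queue_head_t	scribble_wait;

	/* completion side: the last in-flight parity op wakes a
	 * waiting resizer */
	static inline void scribble_op_done(struct r5conf *conf)
	{
		if (atomic_dec_and_test(&conf->scribble_count))
			wake_up(&conf->scribble_wait);
	}

	/* in resize_chunks(), replacing the while/udelay() loop: */
	wait_event(conf->scribble_wait,
		   atomic_read(&conf->scribble_count) == 0);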
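
For comparison, the scribble-size tracking that Neil suggests (mirroring
what resize_stripes() does) might look roughly like this; the
scribble_disks and scribble_sectors fields are hypothetical, not existing
code:

	/* in struct r5conf: remember the geometry the per-cpu scribble
	 * buffers were last sized for */
	int	scribble_disks;
	int	scribble_sectors;

	/* at the top of resize_chunks(): bail out early when the current
	 * buffers are already big enough, so a reshape restarted from
	 * raid5d never reaches mddev_suspend() at all */
	if (conf->scribble_disks >= new_disks &&
	    conf->scribble_sectors >= new_sectors)
		return 0;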