Re: [PATCH] md: simplify flush request handling

Xiao Ni <xni@xxxxxxxxxx> · Fri, 11 May 2018 10:46:59 +0800

Hi Shaohua


On 05/11/2018 06:23 AM, Shaohua Li wrote:
From: Shaohua Li <shli@xxxxxx>

The recent flush request handling seems unncessary complicated. The main
issue is in rdev_end_flush we can either get rdev of the bio or the
flush_info, not both, or we need extra memory to for the other. With the
extra memory, we need reallocate the memory in disk hotadd/remove.
Actually the original patch forgets one case of add_new_disk for memory
allocation, and we have kernel crash.

add_new_disk just adds disk to md as a spare disk. After reshape raid
disks update_raid_disks realloc memory. Why is there a kernel crash?
Could you explain more?

The idea is always to increase all rdev reference in md_flush_request
and decrease the references after bio finish. In this way,
rdev_end_flush doesn't need to know rdev, so we don't need to allocate
extra memory.

Is there a situation like this? It plugs one disk to the raid after the 
flush
bios submitted to underlayer disks. After those flush bios come back there
is one more rdev in the list mddev->disks. If decrements all rdev reference
at one time, it can decrease the rdev reference which doesn't submit flush
bio.  It's the reason I try to allocate memory for bios[0] in flush_info.

Regards
Xiao

Cc: Xiao Ni <xni@xxxxxxxxxx>
Signed-off-by: Shaohua Li <shli@xxxxxx>
---
  drivers/md/md.c | 89 ++++++++++++++-------------------------------------------
  drivers/md/md.h |  3 +-
  2 files changed, 23 insertions(+), 69 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0bb1e2f..d9474f8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -435,16 +435,12 @@ static void rdev_end_flush(struct bio *bi)
  	struct bio *fbio = fi->fbio;
  	struct md_rdev *rdev;
  
-	rcu_read_lock();
-	rdev_for_each_rcu(rdev, mddev)
-		if (fi->bios[rdev->raid_disk] == bi) {
-			fi->bios[rdev->raid_disk] = NULL;
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		rcu_read_lock();
+		rdev_for_each_rcu(rdev, mddev)
  			rdev_dec_pending(rdev, mddev);
-			break;
-		}
-	rcu_read_unlock();
+		rcu_read_unlock();
  
-	if (atomic_dec_and_test(&fi->flush_pending)) {
  		if (fbio->bi_iter.bi_size == 0) {
  			/* an empty barrier - all done */
  			bio_endio(fbio);
@@ -465,14 +461,12 @@ void md_flush_request(struct mddev *mddev, struct bio *fbio)
  {
  	struct md_rdev *rdev;
  	struct flush_info *fi;
-	char *p = (char*)mddev->flush_info;
  	int index;
  
  	atomic_inc(&mddev->flush_io);
  
  	index = jhash((void*)fbio, sizeof(fbio), 0) % NR_FLUSHS;
-	fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
-			+ mddev->raid_disks * sizeof(struct bio*)));
+	fi = &mddev->flush_info[index];
  
  	spin_lock_irq(&fi->flush_lock);
  	wait_event_lock_irq(fi->flush_queue,
@@ -486,6 +480,8 @@ void md_flush_request(struct mddev *mddev, struct bio *fbio)
  
  	rcu_read_lock();
  	rdev_for_each_rcu(rdev, mddev)
+		atomic_inc(&rdev->nr_pending);
+	rdev_for_each_rcu(rdev, mddev)
  		if (rdev->raid_disk >= 0 &&
  		    !test_bit(Faulty, &rdev->flags)) {
  			/* Take two references, one is dropped
@@ -493,8 +489,6 @@ void md_flush_request(struct mddev *mddev, struct bio *fbio)
  			 * we reclaim rcu_read_lock
  			 */
  			struct bio *bi;
-			atomic_inc(&rdev->nr_pending);
-			atomic_inc(&rdev->nr_pending);
  			rcu_read_unlock();
  
  			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
@@ -503,16 +497,18 @@ void md_flush_request(struct mddev *mddev, struct bio *fbio)
  			bio_set_dev(bi, rdev->bdev);
  			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
  
-			fi->bios[rdev->raid_disk] = bi;
  			atomic_inc(&fi->flush_pending);
  			submit_bio(bi);
  
  			rcu_read_lock();
-			rdev_dec_pending(rdev, mddev);
  		}
  	rcu_read_unlock();
  
  	if (atomic_dec_and_test(&fi->flush_pending)) {
+		rcu_read_lock();
+		rdev_for_each_rcu(rdev, mddev)
+			rdev_dec_pending(rdev, mddev);
+		rcu_read_unlock();
  		if (fbio->bi_iter.bi_size == 0) {
  			/* an empty barrier - all done */
  			bio_endio(fbio);
@@ -573,6 +569,16 @@ static void md_safemode_timeout(struct timer_list *t);
  
  void mddev_init(struct mddev *mddev)
  {
+	int index = 0;
+	struct flush_info *fi;
+
+	while (index < NR_FLUSHS) {
+		fi = &mddev->flush_info[index];
+		spin_lock_init(&fi->flush_lock);
+		init_waitqueue_head(&fi->flush_queue);
+		index++;
+	}
+
  	mutex_init(&mddev->open_mutex);
  	mutex_init(&mddev->reconfig_mutex);
  	mutex_init(&mddev->bitmap_info.mutex);
@@ -5537,27 +5543,6 @@ int md_run(struct mddev *mddev)
  			goto abort;
  		}
  	}
-	if (mddev->flush_info == NULL) {
-		int index = 0;
-		char *p;
-		struct flush_info *fi;
-		mddev->flush_info = kzalloc((sizeof(struct flush_info) +
-						sizeof(struct bio*) * mddev->raid_disks) *
-						NR_FLUSHS, GFP_KERNEL);
-		if (!mddev->flush_info) {
-			err = -ENOMEM;
-			goto abort;
-		}
-
-		p = (char*)mddev->flush_info;
-		while (index < NR_FLUSHS) {
-			fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
-				+ mddev->raid_disks * sizeof(struct bio*)));
-			spin_lock_init(&fi->flush_lock);
-			init_waitqueue_head(&fi->flush_queue);
-			index++;
-		}
-	}
  
  	spin_lock(&pers_lock);
  	pers = find_pers(mddev->level, mddev->clevel);
@@ -5717,9 +5702,6 @@ int md_run(struct mddev *mddev)
  	return 0;
  
  abort:
-	if (mddev->flush_info) {
-		kfree(mddev->flush_info);
-	}
  	if (mddev->bio_set) {
  		bioset_free(mddev->bio_set);
  		mddev->bio_set = NULL;
@@ -5940,10 +5922,6 @@ void md_stop(struct mddev *mddev)
  	 * This is called from dm-raid
  	 */
  	__md_stop(mddev);
-	if (mddev->flush_info) {
-		kfree(mddev->flush_info);
-		mddev->flush_info = NULL;
-	}
  	if (mddev->bio_set) {
  		bioset_free(mddev->bio_set);
  		mddev->bio_set = NULL;
@@ -6910,10 +6888,8 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
  
  static int update_raid_disks(struct mddev *mddev, int raid_disks)
  {
-	int rv, index;
+	int rv;
  	struct md_rdev *rdev;
-	struct flush_info *new, *fi;
-	char *p;
  	/* change the number of raid disks */
  	if (mddev->pers->check_reshape == NULL)
  		return -EINVAL;
@@ -6942,31 +6918,10 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
  	else if (mddev->delta_disks > 0)
  		mddev->reshape_backwards = 0;
  
-	new = kzalloc((sizeof(struct flush_info) + sizeof(struct bio*) *
-				raid_disks) * NR_FLUSHS, GFP_KERNEL);
-	if (!new)
-		return -ENOMEM;
-
-	p = (char*)new;
-	index = 0;
-	while (index < NR_FLUSHS) {
-		fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
-				+ raid_disks * sizeof(struct bio*)));
-		spin_lock_init(&fi->flush_lock);
-		init_waitqueue_head(&fi->flush_queue);
-		index++;
-	}
-
  	rv = mddev->pers->check_reshape(mddev);
  	if (rv < 0) {
  		mddev->delta_disks = 0;
  		mddev->reshape_backwards = 0;
-		kfree(new);
-	} else {
-		mddev_suspend(mddev);
-		kfree(mddev->flush_info);
-		mddev->flush_info = new;
-		mddev_resume(mddev);
  	}
  	return rv;
  }
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 24f22f5..a3c88b1 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -260,7 +260,6 @@ struct flush_info {
  	spinlock_t			flush_lock;
  	wait_queue_head_t		flush_queue;
  	struct bio			*fbio;
-	struct bio			*bios[0];
  };
  
  struct mddev {
@@ -470,7 +469,7 @@ struct mddev {
  	/*
  	 * Generic flush handling.
  	 */
-	struct flush_info		*flush_info;
+	struct flush_info		flush_info[NR_FLUSHS];
  	atomic_t			flush_io;
  
  	struct work_struct event_work;	/* used by dm to report failure event */

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html