[PATCH] md: simplify flush request handling

Shaohua Li <shli@xxxxxxxxxx> · Thu, 10 May 2018 15:23:19 -0700

From: Shaohua Li <shli@xxxxxx>

The recent flush request handling seems unncessary complicated. The main
issue is in rdev_end_flush we can either get rdev of the bio or the
flush_info, not both, or we need extra memory to for the other. With the
extra memory, we need reallocate the memory in disk hotadd/remove.
Actually the original patch forgets one case of add_new_disk for memory
allocation, and we have kernel crash.

The idea is always to increase all rdev reference in md_flush_request
and decrease the references after bio finish. In this way,
rdev_end_flush doesn't need to know rdev, so we don't need to allocate
extra memory.

Cc: Xiao Ni <xni@xxxxxxxxxx>
Signed-off-by: Shaohua Li <shli@xxxxxx>
---
 drivers/md/md.c | 89 ++++++++++++++-------------------------------------------
 drivers/md/md.h |  3 +-
 2 files changed, 23 insertions(+), 69 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0bb1e2f..d9474f8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -435,16 +435,12 @@ static void rdev_end_flush(struct bio *bi)
 	struct bio *fbio = fi->fbio;
 	struct md_rdev *rdev;
 
-	rcu_read_lock();
-	rdev_for_each_rcu(rdev, mddev)
-		if (fi->bios[rdev->raid_disk] == bi) {
-			fi->bios[rdev->raid_disk] = NULL;
+	if (atomic_dec_and_test(&fi->flush_pending)) {
+		rcu_read_lock();
+		rdev_for_each_rcu(rdev, mddev)
 			rdev_dec_pending(rdev, mddev);
-			break;
-		}
-	rcu_read_unlock();
+		rcu_read_unlock();
 
-	if (atomic_dec_and_test(&fi->flush_pending)) {
 		if (fbio->bi_iter.bi_size == 0) {
 			/* an empty barrier - all done */
 			bio_endio(fbio);
@@ -465,14 +461,12 @@ void md_flush_request(struct mddev *mddev, struct bio *fbio)
 {
 	struct md_rdev *rdev;
 	struct flush_info *fi;
-	char *p = (char*)mddev->flush_info;
 	int index;
 
 	atomic_inc(&mddev->flush_io);
 
 	index = jhash((void*)fbio, sizeof(fbio), 0) % NR_FLUSHS;
-	fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
-			+ mddev->raid_disks * sizeof(struct bio*)));
+	fi = &mddev->flush_info[index];
 
 	spin_lock_irq(&fi->flush_lock);
 	wait_event_lock_irq(fi->flush_queue,
@@ -486,6 +480,8 @@ void md_flush_request(struct mddev *mddev, struct bio *fbio)
 
 	rcu_read_lock();
 	rdev_for_each_rcu(rdev, mddev)
+		atomic_inc(&rdev->nr_pending);
+	rdev_for_each_rcu(rdev, mddev)
 		if (rdev->raid_disk >= 0 &&
 		    !test_bit(Faulty, &rdev->flags)) {
 			/* Take two references, one is dropped
@@ -493,8 +489,6 @@ void md_flush_request(struct mddev *mddev, struct bio *fbio)
 			 * we reclaim rcu_read_lock
 			 */
 			struct bio *bi;
-			atomic_inc(&rdev->nr_pending);
-			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
 
 			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
@@ -503,16 +497,18 @@ void md_flush_request(struct mddev *mddev, struct bio *fbio)
 			bio_set_dev(bi, rdev->bdev);
 			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
 
-			fi->bios[rdev->raid_disk] = bi;
 			atomic_inc(&fi->flush_pending);
 			submit_bio(bi);
 
 			rcu_read_lock();
-			rdev_dec_pending(rdev, mddev);
 		}
 	rcu_read_unlock();
 
 	if (atomic_dec_and_test(&fi->flush_pending)) {
+		rcu_read_lock();
+		rdev_for_each_rcu(rdev, mddev)
+			rdev_dec_pending(rdev, mddev);
+		rcu_read_unlock();
 		if (fbio->bi_iter.bi_size == 0) {
 			/* an empty barrier - all done */
 			bio_endio(fbio);
@@ -573,6 +569,16 @@ static void md_safemode_timeout(struct timer_list *t);
 
 void mddev_init(struct mddev *mddev)
 {
+	int index = 0;
+	struct flush_info *fi;
+
+	while (index < NR_FLUSHS) {
+		fi = &mddev->flush_info[index];
+		spin_lock_init(&fi->flush_lock);
+		init_waitqueue_head(&fi->flush_queue);
+		index++;
+	}
+
 	mutex_init(&mddev->open_mutex);
 	mutex_init(&mddev->reconfig_mutex);
 	mutex_init(&mddev->bitmap_info.mutex);
@@ -5537,27 +5543,6 @@ int md_run(struct mddev *mddev)
 			goto abort;
 		}
 	}
-	if (mddev->flush_info == NULL) {
-		int index = 0;
-		char *p;
-		struct flush_info *fi;
-		mddev->flush_info = kzalloc((sizeof(struct flush_info) +
-						sizeof(struct bio*) * mddev->raid_disks) *
-						NR_FLUSHS, GFP_KERNEL);
-		if (!mddev->flush_info) {
-			err = -ENOMEM;
-			goto abort;
-		}
-
-		p = (char*)mddev->flush_info;
-		while (index < NR_FLUSHS) {
-			fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
-				+ mddev->raid_disks * sizeof(struct bio*)));
-			spin_lock_init(&fi->flush_lock);
-			init_waitqueue_head(&fi->flush_queue);
-			index++;
-		}
-	}
 
 	spin_lock(&pers_lock);
 	pers = find_pers(mddev->level, mddev->clevel);
@@ -5717,9 +5702,6 @@ int md_run(struct mddev *mddev)
 	return 0;
 
 abort:
-	if (mddev->flush_info) {
-		kfree(mddev->flush_info);
-	}
 	if (mddev->bio_set) {
 		bioset_free(mddev->bio_set);
 		mddev->bio_set = NULL;
@@ -5940,10 +5922,6 @@ void md_stop(struct mddev *mddev)
 	 * This is called from dm-raid
 	 */
 	__md_stop(mddev);
-	if (mddev->flush_info) {
-		kfree(mddev->flush_info);
-		mddev->flush_info = NULL;
-	}
 	if (mddev->bio_set) {
 		bioset_free(mddev->bio_set);
 		mddev->bio_set = NULL;
@@ -6910,10 +6888,8 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
 
 static int update_raid_disks(struct mddev *mddev, int raid_disks)
 {
-	int rv, index;
+	int rv;
 	struct md_rdev *rdev;
-	struct flush_info *new, *fi;
-	char *p;
 	/* change the number of raid disks */
 	if (mddev->pers->check_reshape == NULL)
 		return -EINVAL;
@@ -6942,31 +6918,10 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
 	else if (mddev->delta_disks > 0)
 		mddev->reshape_backwards = 0;
 
-	new = kzalloc((sizeof(struct flush_info) + sizeof(struct bio*) *
-				raid_disks) * NR_FLUSHS, GFP_KERNEL);
-	if (!new)
-		return -ENOMEM;
-
-	p = (char*)new;
-	index = 0;
-	while (index < NR_FLUSHS) {
-		fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
-				+ raid_disks * sizeof(struct bio*)));
-		spin_lock_init(&fi->flush_lock);
-		init_waitqueue_head(&fi->flush_queue);
-		index++;
-	}
-
 	rv = mddev->pers->check_reshape(mddev);
 	if (rv < 0) {
 		mddev->delta_disks = 0;
 		mddev->reshape_backwards = 0;
-		kfree(new);
-	} else {
-		mddev_suspend(mddev);
-		kfree(mddev->flush_info);
-		mddev->flush_info = new;
-		mddev_resume(mddev);
 	}
 	return rv;
 }
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 24f22f5..a3c88b1 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -260,7 +260,6 @@ struct flush_info {
 	spinlock_t			flush_lock;
 	wait_queue_head_t		flush_queue;
 	struct bio			*fbio;
-	struct bio			*bios[0];
 };
 
 struct mddev {
@@ -470,7 +469,7 @@ struct mddev {
 	/*
 	 * Generic flush handling.
 	 */
-	struct flush_info		*flush_info;
+	struct flush_info		flush_info[NR_FLUSHS];
 	atomic_t			flush_io;
 
 	struct work_struct event_work;	/* used by dm to report failure event */
-- 
2.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html