From: Shaohua Li <shli@xxxxxx> The recent flush request handling seems unncessary complicated. The main issue is in rdev_end_flush we can either get rdev of the bio or the flush_info, not both, or we need extra memory to for the other. With the extra memory, we need reallocate the memory in disk hotadd/remove. Actually the original patch forgets one case of add_new_disk for memory allocation, and we have kernel crash. The idea is always to increase all rdev reference in md_flush_request and decrease the references after bio finish. In this way, rdev_end_flush doesn't need to know rdev, so we don't need to allocate extra memory. Cc: Xiao Ni <xni@xxxxxxxxxx> Signed-off-by: Shaohua Li <shli@xxxxxx> --- drivers/md/md.c | 89 ++++++++++++++------------------------------------------- drivers/md/md.h | 3 +- 2 files changed, 23 insertions(+), 69 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 0bb1e2f..d9474f8 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -435,16 +435,12 @@ static void rdev_end_flush(struct bio *bi) struct bio *fbio = fi->fbio; struct md_rdev *rdev; - rcu_read_lock(); - rdev_for_each_rcu(rdev, mddev) - if (fi->bios[rdev->raid_disk] == bi) { - fi->bios[rdev->raid_disk] = NULL; + if (atomic_dec_and_test(&fi->flush_pending)) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) rdev_dec_pending(rdev, mddev); - break; - } - rcu_read_unlock(); + rcu_read_unlock(); - if (atomic_dec_and_test(&fi->flush_pending)) { if (fbio->bi_iter.bi_size == 0) { /* an empty barrier - all done */ bio_endio(fbio); @@ -465,14 +461,12 @@ void md_flush_request(struct mddev *mddev, struct bio *fbio) { struct md_rdev *rdev; struct flush_info *fi; - char *p = (char*)mddev->flush_info; int index; atomic_inc(&mddev->flush_io); index = jhash((void*)fbio, sizeof(fbio), 0) % NR_FLUSHS; - fi = (struct flush_info *)(p + index * (sizeof(struct flush_info) - + mddev->raid_disks * sizeof(struct bio*))); + fi = &mddev->flush_info[index]; spin_lock_irq(&fi->flush_lock); wait_event_lock_irq(fi->flush_queue, @@ -486,6 +480,8 @@ void md_flush_request(struct mddev *mddev, struct bio *fbio) rcu_read_lock(); rdev_for_each_rcu(rdev, mddev) + atomic_inc(&rdev->nr_pending); + rdev_for_each_rcu(rdev, mddev) if (rdev->raid_disk >= 0 && !test_bit(Faulty, &rdev->flags)) { /* Take two references, one is dropped @@ -493,8 +489,6 @@ void md_flush_request(struct mddev *mddev, struct bio *fbio) * we reclaim rcu_read_lock */ struct bio *bi; - atomic_inc(&rdev->nr_pending); - atomic_inc(&rdev->nr_pending); rcu_read_unlock(); bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); @@ -503,16 +497,18 @@ void md_flush_request(struct mddev *mddev, struct bio *fbio) bio_set_dev(bi, rdev->bdev); bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - fi->bios[rdev->raid_disk] = bi; atomic_inc(&fi->flush_pending); submit_bio(bi); rcu_read_lock(); - rdev_dec_pending(rdev, mddev); } rcu_read_unlock(); if (atomic_dec_and_test(&fi->flush_pending)) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) + rdev_dec_pending(rdev, mddev); + rcu_read_unlock(); if (fbio->bi_iter.bi_size == 0) { /* an empty barrier - all done */ bio_endio(fbio); @@ -573,6 +569,16 @@ static void md_safemode_timeout(struct timer_list *t); void mddev_init(struct mddev *mddev) { + int index = 0; + struct flush_info *fi; + + while (index < NR_FLUSHS) { + fi = &mddev->flush_info[index]; + spin_lock_init(&fi->flush_lock); + init_waitqueue_head(&fi->flush_queue); + index++; + } + mutex_init(&mddev->open_mutex); mutex_init(&mddev->reconfig_mutex); mutex_init(&mddev->bitmap_info.mutex); @@ -5537,27 +5543,6 @@ int md_run(struct mddev *mddev) goto abort; } } - if (mddev->flush_info == NULL) { - int index = 0; - char *p; - struct flush_info *fi; - mddev->flush_info = kzalloc((sizeof(struct flush_info) + - sizeof(struct bio*) * mddev->raid_disks) * - NR_FLUSHS, GFP_KERNEL); - if (!mddev->flush_info) { - err = -ENOMEM; - goto abort; - } - - p = (char*)mddev->flush_info; - while (index < NR_FLUSHS) { - fi = (struct flush_info *)(p + index * (sizeof(struct flush_info) - + mddev->raid_disks * sizeof(struct bio*))); - spin_lock_init(&fi->flush_lock); - init_waitqueue_head(&fi->flush_queue); - index++; - } - } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -5717,9 +5702,6 @@ int md_run(struct mddev *mddev) return 0; abort: - if (mddev->flush_info) { - kfree(mddev->flush_info); - } if (mddev->bio_set) { bioset_free(mddev->bio_set); mddev->bio_set = NULL; @@ -5940,10 +5922,6 @@ void md_stop(struct mddev *mddev) * This is called from dm-raid */ __md_stop(mddev); - if (mddev->flush_info) { - kfree(mddev->flush_info); - mddev->flush_info = NULL; - } if (mddev->bio_set) { bioset_free(mddev->bio_set); mddev->bio_set = NULL; @@ -6910,10 +6888,8 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) static int update_raid_disks(struct mddev *mddev, int raid_disks) { - int rv, index; + int rv; struct md_rdev *rdev; - struct flush_info *new, *fi; - char *p; /* change the number of raid disks */ if (mddev->pers->check_reshape == NULL) return -EINVAL; @@ -6942,31 +6918,10 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) else if (mddev->delta_disks > 0) mddev->reshape_backwards = 0; - new = kzalloc((sizeof(struct flush_info) + sizeof(struct bio*) * - raid_disks) * NR_FLUSHS, GFP_KERNEL); - if (!new) - return -ENOMEM; - - p = (char*)new; - index = 0; - while (index < NR_FLUSHS) { - fi = (struct flush_info *)(p + index * (sizeof(struct flush_info) - + raid_disks * sizeof(struct bio*))); - spin_lock_init(&fi->flush_lock); - init_waitqueue_head(&fi->flush_queue); - index++; - } - rv = mddev->pers->check_reshape(mddev); if (rv < 0) { mddev->delta_disks = 0; mddev->reshape_backwards = 0; - kfree(new); - } else { - mddev_suspend(mddev); - kfree(mddev->flush_info); - mddev->flush_info = new; - mddev_resume(mddev); } return rv; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 24f22f5..a3c88b1 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -260,7 +260,6 @@ struct flush_info { spinlock_t flush_lock; wait_queue_head_t flush_queue; struct bio *fbio; - struct bio *bios[0]; }; struct mddev { @@ -470,7 +469,7 @@ struct mddev { /* * Generic flush handling. */ - struct flush_info *flush_info; + struct flush_info flush_info[NR_FLUSHS]; atomic_t flush_io; struct work_struct event_work; /* used by dm to report failure event */ -- 2.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html