There is lock contention when many processes send flush bios to an md
device, e.g. when many LVs are created on one RAID device and mkfs.xfs
is run on each LV. Currently flush requests can only be handled
sequentially: a process must wait for mddev->flush_bio to become NULL
before it can take mddev->lock and install its own bio. This patch
removes mddev->flush_bio and handles flush bios asynchronously.

I ran a test with the command dbench -s 128 -t 4800, on one raid1 with
2 SSDs. These are the results:

=================Without the patch============================
Operation                Count    AvgLat    MaxLat
--------------------------------------------------
Flush                     5239   142.590  3972.034
Close                    53114     0.176   498.236
LockX                      208     0.066     0.907
Rename                    2793     0.335     7.203
ReadX                    98100     0.020     2.280
WriteX                   67800   555.649  8238.498
Unlink                    7985     1.742   446.503
UnlockX                    208     0.058     1.013
FIND_FIRST               21035     0.141     3.147
SET_FILE_INFORMATION      6419     0.090     1.539
QUERY_FILE_INFORMATION   18244     0.007     0.130
QUERY_PATH_INFORMATION   55622     0.060     3.884
QUERY_FS_INFORMATION      9451     0.040     1.148
NTCreateX                63960     0.717   536.542

Throughput 12.1782 MB/sec (sync open) 128 clients 128 procs  max_latency=8238.513 ms

=====================With the patch===========================
Operation                Count    AvgLat    MaxLat
--------------------------------------------------
Flush                    34858    36.484   668.243
Close                   379883     0.107   252.232
LockX                     1792     0.048     1.070
Rename                   21761     0.804   266.659
ReadX                   817947     0.021    42.891
WriteX                  254804   142.485   948.090
Unlink                   99665     3.590   899.816
UnlockX                   1792     0.056     1.240
FIND_FIRST              178857     0.187    23.287
SET_FILE_INFORMATION     41612     0.135    26.575
QUERY_FILE_INFORMATION   83691     0.007     2.589
QUERY_PATH_INFORMATION  470889     0.077    83.846
QUERY_FS_INFORMATION     82764     0.056    10.368
NTCreateX               512262     0.616   809.980

Throughput 53.6545 MB/sec (sync open) 128 clients 128 procs  max_latency=948.105 ms

V3:
Shaohua suggested that a mempool is overkill. V3 allocates the memory
when the raid device is created and uses a simple bitmap to record
which resources are free. It also fixes a bug from V2: flush_pending
must be set to 1 first.

V2:
Neil pointed out two problems: a reference counting error and the
wrong return value when memory allocation fails.
1. Reference counting error. Quoting Neil:
   "This isn't safe. It is only safe to call rdev_dec_pending() on
   rdevs that you previously called atomic_inc(&rdev->nr_pending) on.
   If an rdev was added to the list between the start and end of the
   flush, this will do something bad."
   The patch no longer uses bio_chain; it now uses a dedicated
   callback function for each flush bio.
2. Returning an IO error when kmalloc fails is wrong; V2 used a
   mempool, as suggested by Neil.
3. Fixed some places pointed out by Guoqing.
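To make the V3 scheme concrete, here is a minimal userspace sketch of
the same slot-allocation pattern: a fixed pool of NR_FLUSHS slots, a
bitmap of busy slots, and waiters that sleep until a slot is freed.
The helper names get_flush_id()/put_flush_id() are hypothetical, and a
pthread mutex/condvar stand in for mddev->flush_lock and
mddev->flush_queue; this is an illustration, not the kernel code.

/*
 * Userspace analogue of the bitmap slot pool used by the patch.
 * A bool array stands in for the flush_ids bitmap.
 */
#include <pthread.h>
#include <stdbool.h>

#define NR_FLUSHS 16

static pthread_mutex_t flush_lock  = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  flush_queue = PTHREAD_COND_INITIALIZER;
static bool flush_ids[NR_FLUSHS];

/* Claim a free slot, sleeping while all NR_FLUSHS slots are in use. */
static int get_flush_id(void)
{
        int i;

        pthread_mutex_lock(&flush_lock);
        for (;;) {
                for (i = 0; i < NR_FLUSHS; i++)
                        if (!flush_ids[i])
                                goto found;
                /* No free slot: sleep until put_flush_id() signals
                 * (rdev_end_flush()/submit_flushes() in the patch). */
                pthread_cond_wait(&flush_queue, &flush_lock);
        }
found:
        flush_ids[i] = true;
        pthread_mutex_unlock(&flush_lock);
        return i;
}

/* Release a slot and wake one waiter. */
static void put_flush_id(int i)
{
        pthread_mutex_lock(&flush_lock);
        flush_ids[i] = false;
        pthread_mutex_unlock(&flush_lock);
        pthread_cond_signal(&flush_queue);
}

Note that because each pool entry is sizeof(struct flush_info) plus
raid_disks trailing bio pointers (the bios[0] flexible array), the
entries cannot be indexed as a plain C array; this is why
md_flush_request() below locates entry fi_bit with explicit byte
arithmetic from a char pointer.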
Suggested-by: Ming Lei <ming.lei@xxxxxxxxxx>
Signed-off-by: Xiao Ni <xni@xxxxxxxxxx>
---
 drivers/md/md.c | 184 +++++++++++++++++++++++++++++++++++++++-----------------
 drivers/md/md.h |  23 ++++---
 2 files changed, 145 insertions(+), 62 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 254e44e..dd9038b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -412,30 +412,84 @@ static int md_congested(void *data, int bits)
 /*
  * Generic flush handling for md
  */
+static void submit_flushes(struct work_struct *ws)
+{
+        struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
+        struct mddev *mddev = fi->mddev;
+        struct bio *fbio = fi->fbio;
+
+        clear_bit(fi->flush_id, mddev->flush_ids);
+        wake_up(&mddev->flush_queue);
+
+        fbio->bi_opf &= ~REQ_PREFLUSH;
+        md_handle_request(mddev, fbio);
+}
 
-static void md_end_flush(struct bio *bio)
+static void rdev_end_flush(struct bio *bi)
 {
-        struct md_rdev *rdev = bio->bi_private;
-        struct mddev *mddev = rdev->mddev;
+        struct flush_info *fi = bi->bi_private;
+        struct mddev *mddev = fi->mddev;
+        struct bio *fbio = fi->fbio;
+        struct md_rdev *rdev;
 
-        rdev_dec_pending(rdev, mddev);
+        rcu_read_lock();
+        rdev_for_each_rcu(rdev, mddev)
+                if (fi->bios[rdev->raid_disk] == bi) {
+                        fi->bios[rdev->raid_disk] = NULL;
+                        rdev_dec_pending(rdev, mddev);
+                        break;
+                }
+        rcu_read_unlock();
 
-        if (atomic_dec_and_test(&mddev->flush_pending)) {
-                /* The pre-request flush has finished */
-                queue_work(md_wq, &mddev->flush_work);
+        if (atomic_dec_and_test(&fi->flush_pending)) {
+                if (fbio->bi_iter.bi_size == 0) {
+                        /* an empty barrier - all done */
+                        bio_endio(fbio);
+                        clear_bit(fi->flush_id, mddev->flush_ids);
+                        wake_up(&mddev->flush_queue);
+                } else {
+                        INIT_WORK(&fi->flush_work, submit_flushes);
+                        queue_work(md_wq, &fi->flush_work);
+                }
         }
-        bio_put(bio);
-}
 
-static void md_submit_flush_data(struct work_struct *ws);
+        bio_put(bi);
+}
 
-static void submit_flushes(struct work_struct *ws)
+void md_flush_request(struct mddev *mddev, struct bio *bio)
 {
-        struct mddev *mddev = container_of(ws, struct mddev, flush_work);
         struct md_rdev *rdev;
+        struct flush_info *fi;
+        unsigned long fi_bit;
+        char *p = (char*)mddev->flush_info;
+
+        spin_lock_irq(&mddev->flush_lock);
+        fi_bit = find_first_zero_bit(mddev->flush_ids, NR_FLUSHS);
+        if (fi_bit == NR_FLUSHS) {
+                DEFINE_WAIT(w);
+                for (;;) {
+                        prepare_to_wait(&mddev->flush_queue,
+                                        &w, TASK_IDLE);
+                        fi_bit = find_first_zero_bit(mddev->flush_ids,
+                                        NR_FLUSHS);
+                        if (fi_bit != NR_FLUSHS)
+                                break;
+                        spin_unlock_irq(&mddev->flush_lock);
+                        schedule();
+                        spin_lock_irq(&mddev->flush_lock);
+                }
+                finish_wait(&mddev->flush_queue, &w);
+        }
+
+        set_bit(fi_bit, mddev->flush_ids);
+        fi = (struct flush_info *)(p + fi_bit * (sizeof(struct flush_info) +
+                        mddev->raid_disks * sizeof(struct bio*)));
+        fi->flush_id = fi_bit;
+        spin_unlock_irq(&mddev->flush_lock);
+
+        fi->mddev = mddev;
+        fi->fbio = bio;
+        atomic_set(&fi->flush_pending, 1);
 
-        INIT_WORK(&mddev->flush_work, md_submit_flush_data);
-        atomic_set(&mddev->flush_pending, 1);
         rcu_read_lock();
         rdev_for_each_rcu(rdev, mddev)
                 if (rdev->raid_disk >= 0 &&
@@ -448,56 +502,34 @@ static void submit_flushes(struct work_struct *ws)
                         atomic_inc(&rdev->nr_pending);
                         atomic_inc(&rdev->nr_pending);
                         rcu_read_unlock();
+
                         bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
-                        bi->bi_end_io = md_end_flush;
-                        bi->bi_private = rdev;
                         bio_set_dev(bi, rdev->bdev);
+                        bi->bi_end_io = rdev_end_flush;
+                        bi->bi_private = fi;
                         bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-                        atomic_inc(&mddev->flush_pending);
+
+                        fi->bios[rdev->raid_disk] = bi;
+                        atomic_inc(&fi->flush_pending);
                         submit_bio(bi);
+
                         rcu_read_lock();
                         rdev_dec_pending(rdev, mddev);
                 }
         rcu_read_unlock();
 
-        if (atomic_dec_and_test(&mddev->flush_pending))
-                queue_work(md_wq, &mddev->flush_work);
-}
 
-static void md_submit_flush_data(struct work_struct *ws)
-{
-        struct mddev *mddev = container_of(ws, struct mddev, flush_work);
-        struct bio *bio = mddev->flush_bio;
-
-        /*
-         * must reset flush_bio before calling into md_handle_request to avoid a
-         * deadlock, because other bios passed md_handle_request suspend check
-         * could wait for this and below md_handle_request could wait for those
-         * bios because of suspend check
-         */
-        mddev->flush_bio = NULL;
-        wake_up(&mddev->sb_wait);
-
-        if (bio->bi_iter.bi_size == 0)
-                /* an empty barrier - all done */
-                bio_endio(bio);
-        else {
-                bio->bi_opf &= ~REQ_PREFLUSH;
-                md_handle_request(mddev, bio);
+        if (atomic_dec_and_test(&fi->flush_pending)) {
+                if (bio->bi_iter.bi_size == 0) {
+                        /* an empty barrier - all done */
+                        bio_endio(bio);
+                        clear_bit(fi->flush_id, mddev->flush_ids);
+                        wake_up(&mddev->flush_queue);
+                } else {
+                        INIT_WORK(&fi->flush_work, submit_flushes);
+                        queue_work(md_wq, &fi->flush_work);
+                }
         }
 }
-
-void md_flush_request(struct mddev *mddev, struct bio *bio)
-{
-        spin_lock_irq(&mddev->lock);
-        wait_event_lock_irq(mddev->sb_wait,
-                            !mddev->flush_bio,
-                            mddev->lock);
-        mddev->flush_bio = bio;
-        spin_unlock_irq(&mddev->lock);
-
-        INIT_WORK(&mddev->flush_work, submit_flushes);
-        queue_work(md_wq, &mddev->flush_work);
-}
 EXPORT_SYMBOL(md_flush_request);
 
 static inline struct mddev *mddev_get(struct mddev *mddev)
@@ -555,7 +587,7 @@ void mddev_init(struct mddev *mddev)
         atomic_set(&mddev->openers, 0);
         atomic_set(&mddev->active_io, 0);
         spin_lock_init(&mddev->lock);
-        atomic_set(&mddev->flush_pending, 0);
+        spin_lock_init(&mddev->flush_lock);
         init_waitqueue_head(&mddev->sb_wait);
         init_waitqueue_head(&mddev->recovery_wait);
         mddev->reshape_position = MaxSector;
@@ -5509,6 +5541,24 @@ int md_run(struct mddev *mddev)
                         goto abort;
                 }
         }
+        if (mddev->flush_info == NULL) {
+                mddev->flush_info = kzalloc((sizeof(struct flush_info) +
+                                sizeof(struct bio*) * mddev->raid_disks) *
+                                NR_FLUSHS, GFP_KERNEL);
+                if (!mddev->flush_info) {
+                        err = -ENOMEM;
+                        goto abort;
+                }
+        }
+        if (mddev->flush_ids == NULL) {
+                mddev->flush_ids = kzalloc(BITS_TO_LONGS(NR_FLUSHS) *
+                                sizeof(unsigned long), GFP_KERNEL);
+                if (!mddev->flush_ids) {
+                        err = -ENOMEM;
+                        goto abort;
+                }
+        }
+
+        init_waitqueue_head(&mddev->flush_queue);
 
         spin_lock(&pers_lock);
         pers = find_pers(mddev->level, mddev->clevel);
@@ -5668,6 +5718,12 @@ int md_run(struct mddev *mddev)
         return 0;
 
 abort:
+        if (mddev->flush_ids) {
+                kfree(mddev->flush_ids);
+        }
+        if (mddev->flush_info) {
+                kfree(mddev->flush_info);
+        }
         if (mddev->bio_set) {
                 bioset_free(mddev->bio_set);
                 mddev->bio_set = NULL;
@@ -5888,6 +5944,14 @@ void md_stop(struct mddev *mddev)
          * This is called from dm-raid
          */
         __md_stop(mddev);
+        if (mddev->flush_ids) {
+                kfree(mddev->flush_ids);
+                mddev->flush_ids = NULL;
+        }
+        if (mddev->flush_info) {
+                kfree(mddev->flush_info);
+                mddev->flush_info = NULL;
+        }
         if (mddev->bio_set) {
                 bioset_free(mddev->bio_set);
                 mddev->bio_set = NULL;
@@ -6856,6 +6920,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
 {
         int rv;
         struct md_rdev *rdev;
+        struct flush_info *new;
         /* change the number of raid disks */
         if (mddev->pers->check_reshape == NULL)
                 return -EINVAL;
@@ -6884,10 +6949,21 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
         else if (mddev->delta_disks > 0)
                 mddev->reshape_backwards = 0;
 
+        new = kzalloc((sizeof(struct flush_info) + sizeof(struct bio*) *
+                        raid_disks) * NR_FLUSHS, GFP_KERNEL);
+        if (!new)
+                return -ENOMEM;
+
         rv = mddev->pers->check_reshape(mddev);
         if (rv < 0) {
                 mddev->delta_disks = 0;
                 mddev->reshape_backwards = 0;
+                kfree(new);
+        } else {
+                spin_lock(&mddev->flush_lock);
+                kfree(mddev->flush_info);
+                mddev->flush_info = new;
+                spin_unlock(&mddev->flush_lock);
         }
         return rv;
 }
diff --git a/drivers/md/md.h b/drivers/md/md.h
index fbc925c..83bb62c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -252,6 +252,16 @@ enum mddev_sb_flags {
         MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
 };
 
+#define NR_FLUSHS 16
+struct flush_info {
+        struct mddev            *mddev;
+        struct work_struct      flush_work;
+        int                     flush_id;
+        atomic_t                flush_pending;
+        struct bio              *fbio;
+        struct bio              *bios[0];
+};
+
 struct mddev {
         void                            *private;
         struct md_personality           *pers;
@@ -399,7 +409,6 @@ struct mddev {
         struct work_struct del_work;    /* used for delayed sysfs removal */
 
         /* "lock" protects:
-         *   flush_bio transition from NULL to !NULL
          *   rdev superblocks, events
          *   clearing MD_CHANGE_*
          *   in_sync - and related safemode and MD_CHANGE changes
@@ -457,13 +466,11 @@ struct mddev {
                                                  * metadata and bitmap writes
                                                  */
 
-        /* Generic flush handling.
-         * The last to finish preflush schedules a worker to submit
-         * the rest of the request (without the REQ_PREFLUSH flag).
-         */
-        struct bio *flush_bio;
-        atomic_t flush_pending;
-        struct work_struct flush_work;
+        struct flush_info               *flush_info;
+        unsigned long                   *flush_ids;
+        wait_queue_head_t               flush_queue;
+        spinlock_t                      flush_lock;
+
         struct work_struct event_work;  /* used by dm to report failure event */
         void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
         struct md_cluster_info          *cluster_info;
-- 
2.7.4