On Wed, Mar 21, 2018 at 02:47:22PM +0800, Xiao Ni wrote:
> There is lock contention when many processes send flush bios to an md
> device, e.g. when creating many LVs on one raid device and running
> mkfs.xfs on each LV.
>
> Currently flush requests can only be handled sequentially: each one has
> to wait, under mddev->lock, for mddev->flush_bio to become NULL.
>
> This patch removes mddev->flush_bio and handles flush bios
> asynchronously. I tested with the command dbench -s 128 -t 4800. These
> are the results:
>
> =================Without the patch============================
> Operation                Count    AvgLat    MaxLat
> --------------------------------------------------
> Flush                     5239   142.590  3972.034
> Close                    53114     0.176   498.236
> LockX                      208     0.066     0.907
> Rename                    2793     0.335     7.203
> ReadX                    98100     0.020     2.280
> WriteX                   67800   555.649  8238.498
> Unlink                    7985     1.742   446.503
> UnlockX                    208     0.058     1.013
> FIND_FIRST               21035     0.141     3.147
> SET_FILE_INFORMATION      6419     0.090     1.539
> QUERY_FILE_INFORMATION   18244     0.007     0.130
> QUERY_PATH_INFORMATION   55622     0.060     3.884
> QUERY_FS_INFORMATION      9451     0.040     1.148
> NTCreateX                63960     0.717   536.542
>
> Throughput 12.1782 MB/sec (sync open) 128 clients 128 procs
> max_latency=8238.513 ms
>
> =====================With the patch===========================
> Operation                Count    AvgLat    MaxLat
> --------------------------------------------------
> Flush                    34858    36.484   668.243
> Close                   379883     0.107   252.232
> LockX                     1792     0.048     1.070
> Rename                   21761     0.804   266.659
> ReadX                   817947     0.021    42.891
> WriteX                  254804   142.485   948.090
> Unlink                   99665     3.590   899.816
> UnlockX                   1792     0.056     1.240
> FIND_FIRST              178857     0.187    23.287
> SET_FILE_INFORMATION     41612     0.135    26.575
> QUERY_FILE_INFORMATION   83691     0.007     2.589
> QUERY_PATH_INFORMATION  470889     0.077    83.846
> QUERY_FS_INFORMATION     82764     0.056    10.368
> NTCreateX               512262     0.616   809.980
>
> Throughput 53.6545 MB/sec (sync open) 128 clients 128 procs
> max_latency=948.105 ms
>
> V3:
> Shaohua suggested that a mempool is overkill. In v3 the memory is
> allocated when the raid device is created, and a simple bitmap records
> which resources are free.

Sorry for the delay. The bitmap method is still too complicated. Can we
do something like this instead:

in mddev:
	struct flush_info flush_infos[8 or 16];

Maybe we don't even need a special struct; the info can include a lock.
Every time we handle a flush, we select a flush_info by

	flush_infos[jhash(flush_request_bio_address)];

then take its lock and do whatever the current code does against that
specific flush_info. Isn't this a simpler implementation? I think 8 or
16 locks should reduce most of the lock contention.
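Something along these lines (completely untested, just to sketch the
idea — the names below are made up, and hash_ptr() is one convenient way
to do the jhash step on a pointer):

	#define NR_FLUSH_INFOS 16

	struct flush_info {
		spinlock_t		lock;
		struct bio		*flush_bio;	/* same role as mddev->flush_bio today */
		struct work_struct	flush_work;
	};

	/* in struct mddev: struct flush_info flush_infos[NR_FLUSH_INFOS]; */

	void md_flush_request(struct mddev *mddev, struct bio *bio)
	{
		/* hash the bio address so concurrent flushes spread
		 * across the slots */
		struct flush_info *fi =
			&mddev->flush_infos[hash_ptr(bio, ilog2(NR_FLUSH_INFOS))];

		spin_lock_irq(&fi->lock);
		/* ... then exactly what the current code does against
		 * mddev->flush_bio and mddev->lock, but against
		 * fi->flush_bio and fi->lock ... */
		spin_unlock_irq(&fi->lock);
	}

Each flush then only contends with the flushes that hash to the same
slot, instead of with every other flush on the mddev.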
Thanks,
Shaohua

> Fix a bug from v2: flush_pending has to be set to 1 first.
>
> V2:
> Neil pointed out two problems: a reference-counting error, and the
> return value when memory allocation fails.
> 1. Counting error:
>    This isn't safe. It is only safe to call rdev_dec_pending() on
>    rdevs on which you previously called
>        atomic_inc(&rdev->nr_pending);
>    If an rdev was added to the list between the start and end of the
>    flush, this will do something bad.
>
>    It no longer uses bio_chain; it uses a dedicated callback function
>    for each flush bio.
> 2. Returning an IO error when kmalloc fails is wrong.
>    In v2 I used a mempool, as suggested by Neil.
> 3. Fixed some places pointed out by Guoqing.
>
> The test is done on one raid1 with 2 SSDs.
>
> Suggested-by: Ming Lei <ming.lei@xxxxxxxxxx>
> Signed-off-by: Xiao Ni <xni@xxxxxxxxxx>
> ---
>  drivers/md/md.c | 184 +++++++++++++++++++++++++++++++++++++++-----------------
>  drivers/md/md.h |  23 ++++---
>  2 files changed, 145 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 254e44e..dd9038b 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -412,30 +412,84 @@ static int md_congested(void *data, int bits)
>  /*
>   * Generic flush handling for md
>   */
> +static void submit_flushes(struct work_struct *ws)
> +{
> +	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
> +	struct mddev *mddev = fi->mddev;
> +	struct bio *fbio = fi->fbio;
> +
> +	clear_bit(fi->flush_id, mddev->flush_ids);
> +	wake_up(&mddev->flush_queue);
> +
> +	fbio->bi_opf &= ~REQ_PREFLUSH;
> +	md_handle_request(mddev, fbio);
> +}
>
> -static void md_end_flush(struct bio *bio)
> +static void rdev_end_flush(struct bio *bi)
>  {
> -	struct md_rdev *rdev = bio->bi_private;
> -	struct mddev *mddev = rdev->mddev;
> +	struct flush_info *fi = bi->bi_private;
> +	struct mddev *mddev = fi->mddev;
> +	struct bio *fbio = fi->fbio;
> +	struct md_rdev *rdev;
>
> -	rdev_dec_pending(rdev, mddev);
> +	rcu_read_lock();
> +	rdev_for_each_rcu(rdev, mddev)
> +		if (fi->bios[rdev->raid_disk] == bi) {
> +			fi->bios[rdev->raid_disk] = NULL;
> +			rdev_dec_pending(rdev, mddev);
> +			break;
> +		}
> +	rcu_read_unlock();
>
> -	if (atomic_dec_and_test(&mddev->flush_pending)) {
> -		/* The pre-request flush has finished */
> -		queue_work(md_wq, &mddev->flush_work);
> +	if (atomic_dec_and_test(&fi->flush_pending)) {
> +		if (fbio->bi_iter.bi_size == 0) {
> +			/* an empty barrier - all done */
> +			bio_endio(fbio);
> +			clear_bit(fi->flush_id, mddev->flush_ids);
> +			wake_up(&mddev->flush_queue);
> +		} else {
> +			INIT_WORK(&fi->flush_work, submit_flushes);
> +			queue_work(md_wq, &fi->flush_work);
> +		}
>  	}
> -	bio_put(bio);
> -}
>
> -static void md_submit_flush_data(struct work_struct *ws);
> +	bio_put(bi);
> +}
>
> -static void submit_flushes(struct work_struct *ws)
> +void md_flush_request(struct mddev *mddev, struct bio *bio)
>  {
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
>  	struct md_rdev *rdev;
> +	struct flush_info *fi;
> +	unsigned long fi_bit;
> +	char *p = (char*)mddev->flush_info;
> +
> +	spin_lock_irq(&mddev->flush_lock);
> +	fi_bit = find_first_zero_bit(mddev->flush_ids, NR_FLUSHS);
> +	if (fi_bit == NR_FLUSHS) {
> +		DEFINE_WAIT(w);
> +		for (;;) {
> +			prepare_to_wait(&mddev->flush_queue,
> +					&w, TASK_IDLE);
> +			fi_bit = find_first_zero_bit(mddev->flush_ids, NR_FLUSHS);
> +			if (fi_bit != NR_FLUSHS)
> +				break;
> +			spin_unlock_irq(&mddev->flush_lock);
> +			schedule();
> +			spin_lock_irq(&mddev->flush_lock);
> +		}
> +		finish_wait(&mddev->flush_queue, &w);
> +	}
> +
> +	set_bit(fi_bit, mddev->flush_ids);
> +	fi = (struct flush_info *)(p + fi_bit * (sizeof(struct flush_info)
> +			+ mddev->raid_disks * sizeof(struct bio*)));
> +	fi->flush_id = fi_bit;
> +	spin_unlock_irq(&mddev->flush_lock);
> +
> +	fi->mddev = mddev;
> +	fi->fbio = bio;
> +	atomic_set(&fi->flush_pending, 1);
>
> -	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
> -	atomic_set(&mddev->flush_pending, 1);
>  	rcu_read_lock();
>  	rdev_for_each_rcu(rdev, mddev)
>  		if (rdev->raid_disk >= 0 &&
> @@ -448,56 +502,34 @@ static void submit_flushes(struct work_struct *ws)
>  			atomic_inc(&rdev->nr_pending);
>  			atomic_inc(&rdev->nr_pending);
>  			rcu_read_unlock();
> +
>  			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
> -			bi->bi_end_io = md_end_flush;
> -			bi->bi_private = rdev;
>  			bio_set_dev(bi, rdev->bdev);
> +			bi->bi_end_io = rdev_end_flush;
> +			bi->bi_private = fi;
>  			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
> -			atomic_inc(&mddev->flush_pending);
> +
> +			fi->bios[rdev->raid_disk] = bi;
> +			atomic_inc(&fi->flush_pending);
>  			submit_bio(bi);
> +
>  			rcu_read_lock();
>  			rdev_dec_pending(rdev, mddev);
>  		}
>  	rcu_read_unlock();
> -	if (atomic_dec_and_test(&mddev->flush_pending))
> -		queue_work(md_wq, &mddev->flush_work);
> -}
>
> -static void md_submit_flush_data(struct work_struct *ws)
> -{
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> -	struct bio *bio = mddev->flush_bio;
> -
> -	/*
> -	 * must reset flush_bio before calling into md_handle_request to avoid a
> -	 * deadlock, because other bios passed md_handle_request suspend check
> -	 * could wait for this and below md_handle_request could wait for those
> -	 * bios because of suspend check
> -	 */
> -	mddev->flush_bio = NULL;
> -	wake_up(&mddev->sb_wait);
> -
> -	if (bio->bi_iter.bi_size == 0)
> -		/* an empty barrier - all done */
> -		bio_endio(bio);
> -	else {
> -		bio->bi_opf &= ~REQ_PREFLUSH;
> -		md_handle_request(mddev, bio);
> +	if (atomic_dec_and_test(&fi->flush_pending)) {
> +		if (bio->bi_iter.bi_size == 0) {
> +			/* an empty barrier - all done */
> +			bio_endio(bio);
> +			clear_bit(fi->flush_id, mddev->flush_ids);
> +			wake_up(&mddev->flush_queue);
> +		} else {
> +			INIT_WORK(&fi->flush_work, submit_flushes);
> +			queue_work(md_wq, &fi->flush_work);
> +		}
>  	}
>  }
> -
> -void md_flush_request(struct mddev *mddev, struct bio *bio)
> -{
> -	spin_lock_irq(&mddev->lock);
> -	wait_event_lock_irq(mddev->sb_wait,
> -			    !mddev->flush_bio,
> -			    mddev->lock);
> -	mddev->flush_bio = bio;
> -	spin_unlock_irq(&mddev->lock);
> -
> -	INIT_WORK(&mddev->flush_work, submit_flushes);
> -	queue_work(md_wq, &mddev->flush_work);
> -}
>  EXPORT_SYMBOL(md_flush_request);
>
>  static inline struct mddev *mddev_get(struct mddev *mddev)
> @@ -555,7 +587,7 @@ void mddev_init(struct mddev *mddev)
>  	atomic_set(&mddev->openers, 0);
>  	atomic_set(&mddev->active_io, 0);
>  	spin_lock_init(&mddev->lock);
> -	atomic_set(&mddev->flush_pending, 0);
> +	spin_lock_init(&mddev->flush_lock);
>  	init_waitqueue_head(&mddev->sb_wait);
>  	init_waitqueue_head(&mddev->recovery_wait);
>  	mddev->reshape_position = MaxSector;
> @@ -5509,6 +5541,24 @@ int md_run(struct mddev *mddev)
>  			goto abort;
>  		}
>  	}
> +	if (mddev->flush_info == NULL) {
> +		mddev->flush_info = kzalloc((sizeof(struct flush_info) +
> +				sizeof(struct bio*) * mddev->raid_disks) *
> +				NR_FLUSHS, GFP_KERNEL);
> +		if (!mddev->flush_info) {
> +			err = -ENOMEM;
> +			goto abort;
> +		}
> +	}
> +	if (mddev->flush_ids == NULL) {
> +		mddev->flush_ids = kzalloc(BITS_TO_LONGS(NR_FLUSHS) * sizeof(unsigned long), GFP_KERNEL);
> +		if (!mddev->flush_ids) {
> +			err = -ENOMEM;
> +			goto abort;
> +		}
> +	}
> +
> +	init_waitqueue_head(&mddev->flush_queue);
>
>  	spin_lock(&pers_lock);
>  	pers = find_pers(mddev->level, mddev->clevel);
> @@ -5668,6 +5718,12 @@ int md_run(struct mddev *mddev)
>  	return 0;
>
>  abort:
> +	if (mddev->flush_ids) {
> +		kfree(mddev->flush_ids);
> +	}
> +	if (mddev->flush_info) {
> +		kfree(mddev->flush_info);
> +	}
>  	if (mddev->bio_set) {
>  		bioset_free(mddev->bio_set);
>  		mddev->bio_set = NULL;
> @@ -5888,6 +5944,14 @@ void md_stop(struct mddev *mddev)
>  	 * This is called from dm-raid
>  	 */
>  	__md_stop(mddev);
> +	if (mddev->flush_ids) {
> +		kfree(mddev->flush_ids);
> +		mddev->flush_ids = NULL;
> +	}
> +	if (mddev->flush_info) {
> +		kfree(mddev->flush_info);
> +		mddev->flush_info = NULL;
> +	}
>  	if (mddev->bio_set) {
>  		bioset_free(mddev->bio_set);
>  		mddev->bio_set = NULL;
> @@ -6856,6 +6920,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
>  {
>  	int rv;
>  	struct md_rdev *rdev;
> +	struct flush_info *new;
>  	/* change the number of raid disks */
>  	if (mddev->pers->check_reshape == NULL)
>  		return -EINVAL;
> @@ -6884,10 +6949,21 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
>  	else if (mddev->delta_disks > 0)
>  		mddev->reshape_backwards = 0;
>
> +	new = kzalloc((sizeof(struct flush_info) + sizeof(struct bio*) *
> +			raid_disks) * NR_FLUSHS, GFP_KERNEL);
> +	if (!new)
> +		return -ENOMEM;
> +
>  	rv = mddev->pers->check_reshape(mddev);
>  	if (rv < 0) {
>  		mddev->delta_disks = 0;
>  		mddev->reshape_backwards = 0;
> +		kfree(new);
> +	} else {
> +		spin_lock(&mddev->flush_lock);
> +		kfree(mddev->flush_info);
> +		mddev->flush_info = new;
> +		spin_unlock(&mddev->flush_lock);
>  	}
>  	return rv;
>  }
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index fbc925c..83bb62c 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -252,6 +252,16 @@ enum mddev_sb_flags {
>  	MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
>  };
>
> +#define NR_FLUSHS 16
> +struct flush_info {
> +	struct mddev		*mddev;
> +	struct work_struct	flush_work;
> +	int			flush_id;
> +	atomic_t		flush_pending;
> +	struct bio		*fbio;
> +	struct bio		*bios[0];
> +};
> +
>  struct mddev {
>  	void				*private;
>  	struct md_personality		*pers;
> @@ -399,7 +409,6 @@ struct mddev {
>  	struct work_struct del_work;	/* used for delayed sysfs removal */
>
>  	/* "lock" protects:
> -	 *   flush_bio transition from NULL to !NULL
>  	 *   rdev superblocks, events
>  	 *   clearing MD_CHANGE_*
>  	 *   in_sync - and related safemode and MD_CHANGE changes
> @@ -457,13 +466,11 @@ struct mddev {
>  					 * metadata and bitmap writes
>  					 */
>
> -	/* Generic flush handling.
> -	 * The last to finish preflush schedules a worker to submit
> -	 * the rest of the request (without the REQ_PREFLUSH flag).
> -	 */
> -	struct bio *flush_bio;
> -	atomic_t flush_pending;
> -	struct work_struct flush_work;
> +	struct flush_info	*flush_info;
> +	unsigned long		*flush_ids;
> +	wait_queue_head_t	flush_queue;
> +	spinlock_t		flush_lock;
> +
>  	struct work_struct event_work;	/* used by dm to report failure event */
>  	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
>  	struct md_cluster_info		*cluster_info;
> --
> 2.7.4