On Fri, Apr 06, 2018 at 02:56:11PM +0800, Xiao Ni wrote:
> There is lock contention when many processes send flush bios to an md
> device, e.g. when creating many LVs on one raid device and running
> mkfs.xfs on each LV.
>
> Currently flush requests are handled sequentially: a new flush has to
> wait under mddev->lock for mddev->flush_bio to become NULL.
>
> This patch removes mddev->flush_bio and handles flush bios asynchronously.
> I ran a test with dbench -s 128 -t 300. These are the results:
>
> =================Without the patch============================
> Operation                Count    AvgLat    MaxLat
> --------------------------------------------------
> Flush                   305893     0.648    49.970
> Close                  3204015     0.013    36.673
> LockX                    14210     0.003     0.235
> Rename                  184914     0.038    25.821
> ReadX                  6844043     0.004     3.561
> WriteX                 2155550    17.461    84.646
> Unlink                  882276     0.152    37.705
> UnlockX                  14210     0.002     0.239
> FIND_FIRST             1528913     0.015    21.999
> SET_FILE_INFORMATION    355733     0.007     1.118
> QUERY_FILE_INFORMATION  689672     0.002     0.376
> QUERY_PATH_INFORMATION 3956994     0.006    49.292
> QUERY_FS_INFORMATION    725497     0.004     0.533
> NTCreateX              4363847     0.057    46.578
>
> Throughput 453.259 MB/sec (sync open) 128 clients 128 procs  max_latency=84.650 ms
>
> =====================With the patch===========================
> Operation                Count    AvgLat    MaxLat
> --------------------------------------------------
> Flush                   390444     0.026     3.360
> Close                  4089104     0.008    28.907
> LockX                    18090     0.003     0.146
> Rename                  235907     0.042    19.649
> ReadX                  8728080     0.004     1.926
> WriteX                 2750498    13.734    65.028
> Unlink                 1126388     0.144    33.684
> UnlockX                  18090     0.002     0.078
> FIND_FIRST             1951009     0.015    24.111
> SET_FILE_INFORMATION    454013     0.008     1.866
> QUERY_FILE_INFORMATION  879798     0.002     0.746
> QUERY_PATH_INFORMATION 5050540     0.006    41.920
> QUERY_FS_INFORMATION    925775     0.004     0.322
> NTCreateX              5569810     0.043    29.966
>
> Throughput 578.195 MB/sec (sync open) 128 clients 128 procs  max_latency=65.033 ms
>
> V4: use jhash to choose a free flush_info.
>
> V3:
> Shaohua suggested that mempool is overkill. In v3 the memory is allocated
> when the raid device is created, and a simple bitmap records which
> resources are free.
>
> Fixed a bug from v2: flush_pending must be set to 1 first.
>
> V2:
> Neil pointed out two problems. One is a counting error and the other is
> the return value when memory allocation fails.
> 1. Counting error:
>    This isn't safe. It is only safe to call rdev_dec_pending() on rdevs
>    that you previously called
>       atomic_inc(&rdev->nr_pending);
>    on. If an rdev was added to the list between the start and end of the
>    flush, this will do something bad.
>
>    Now it doesn't use bio_chain; it uses a dedicated callback function
>    for each flush bio.
> 2. Returning an IO error when kmalloc fails is wrong; I use the mempool
>    suggested by Neil in v2.
> 3. Fixed some places pointed out by Guoqing.
>
> The test was done on one raid1 with 2 SSDs.
>
> Suggested-by: Ming Lei <ming.lei@xxxxxxxxxx>
> Signed-off-by: Xiao Ni <xni@xxxxxxxxxx>
> ---
>  drivers/md/md.c | 189 +++++++++++++++++++++++++++++++++++++++-----------------
>  drivers/md/md.h |  23 ++++---
>  2 files changed, 150 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 254e44e..96722fc 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -367,6 +367,7 @@ void mddev_suspend(struct mddev *mddev)
>  	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
>  	smp_mb__after_atomic();
>  	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
> +	wait_event(mddev->sb_wait, atomic_read(&mddev->flush_io) == 0);
>  	mddev->pers->quiesce(mddev, 1);
>  	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
>  	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
> @@ -412,30 +413,78 @@ static int md_congested(void *data, int bits)
>  /*
>   * Generic flush handling for md
>   */
> +static void submit_flushes(struct work_struct *ws)
> +{
> +	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
> +	struct mddev *mddev = fi->mddev;
> +	struct bio *fbio = fi->fbio;
> +
> +	fi->fbio = NULL;
> +	atomic_dec(&mddev->flush_io);
> +	wake_up(&fi->flush_queue);
> +	wake_up(&mddev->sb_wait);
>
> -static void md_end_flush(struct bio *bio)
> +	fbio->bi_opf &= ~REQ_PREFLUSH;
> +	md_handle_request(mddev, fbio);
> +}
> +
> +static void rdev_end_flush(struct bio *bi)
>  {
> -	struct md_rdev *rdev = bio->bi_private;
> -	struct mddev *mddev = rdev->mddev;
> +	struct flush_info *fi = bi->bi_private;
> +	struct mddev *mddev = fi->mddev;
> +	struct bio *fbio = fi->fbio;
> +	struct md_rdev *rdev;
>
> -	rdev_dec_pending(rdev, mddev);
> +	rcu_read_lock();
> +	rdev_for_each_rcu(rdev, mddev)
> +		if (fi->bios[rdev->raid_disk] == bi) {
> +			fi->bios[rdev->raid_disk] = NULL;
> +			rdev_dec_pending(rdev, mddev);
> +			break;
> +		}
> +	rcu_read_unlock();
>
> -	if (atomic_dec_and_test(&mddev->flush_pending)) {
> -		/* The pre-request flush has finished */
> -		queue_work(md_wq, &mddev->flush_work);
> +	if (atomic_dec_and_test(&fi->flush_pending)) {
> +		if (fbio->bi_iter.bi_size == 0) {
> +			/* an empty barrier - all done */
> +			bio_endio(fbio);
> +			fi->fbio = NULL;
> +			atomic_dec(&mddev->flush_io);
> +			wake_up(&fi->flush_queue);
> +			wake_up(&mddev->sb_wait);
> +		} else {
> +			INIT_WORK(&fi->flush_work, submit_flushes);
> +			queue_work(md_wq, &fi->flush_work);
> +		}
>  	}
> -	bio_put(bio);
> -}
>
> -static void md_submit_flush_data(struct work_struct *ws);
> +	bio_put(bi);
> +}
>
> -static void submit_flushes(struct work_struct *ws)
> +void md_flush_request(struct mddev *mddev, struct bio *fbio)
>  {
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
>  	struct md_rdev *rdev;
> +	struct flush_info *fi;
> +	char *p = (char*)mddev->flush_info;
> +	int index;
> +
> +	atomic_inc(&mddev->flush_io);
> +
> +	index = jhash((void*)&fbio->bi_iter.bi_sector,
> +		sizeof(sector_t), 0) % NR_FLUSHS;

If this is a pure flush request, bi_sector is meaningless, so probably we
could just use fbio itself to hash.
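Something along the lines of the untested sketch below; hashing the bio
pointer value spreads pure flushes, whose bi_sector is always 0, across
the NR_FLUSHS slots:

	/* hash the pointer value of the flush bio, not its (zero) sector */
	index = jhash(&fbio, sizeof(fbio), 0) % NR_FLUSHS;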
> +	fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
> +		+ mddev->raid_disks * sizeof(struct bio*)));
> +
> +	spin_lock_irq(&fi->flush_lock);
> +	wait_event_lock_irq(fi->flush_queue,
> +			    !fi->fbio,
> +			    fi->flush_lock);
> +	fi->fbio = fbio;
> +	spin_unlock_irq(&fi->flush_lock);
> +
> +	fi->mddev = mddev;
> +	atomic_set(&fi->flush_pending, 1);
>
> -	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
> -	atomic_set(&mddev->flush_pending, 1);
>  	rcu_read_lock();
>  	rdev_for_each_rcu(rdev, mddev)
>  		if (rdev->raid_disk >= 0 &&
> @@ -448,56 +497,36 @@ static void submit_flushes(struct work_struct *ws)
>  			atomic_inc(&rdev->nr_pending);
>  			atomic_inc(&rdev->nr_pending);
>  			rcu_read_unlock();
> +
>  			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
> -			bi->bi_end_io = md_end_flush;
> -			bi->bi_private = rdev;
> +			bi->bi_end_io = rdev_end_flush;
> +			bi->bi_private = fi;
>  			bio_set_dev(bi, rdev->bdev);
>  			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
> -			atomic_inc(&mddev->flush_pending);
> +
> +			fi->bios[rdev->raid_disk] = bi;
> +			atomic_inc(&fi->flush_pending);
>  			submit_bio(bi);
> +
>  			rcu_read_lock();
>  			rdev_dec_pending(rdev, mddev);
>  		}
>  	rcu_read_unlock();
> -	if (atomic_dec_and_test(&mddev->flush_pending))
> -		queue_work(md_wq, &mddev->flush_work);
> -}
>
> -static void md_submit_flush_data(struct work_struct *ws)
> -{
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> -	struct bio *bio = mddev->flush_bio;
> -
> -	/*
> -	 * must reset flush_bio before calling into md_handle_request to avoid a
> -	 * deadlock, because other bios passed md_handle_request suspend check
> -	 * could wait for this and below md_handle_request could wait for those
> -	 * bios because of suspend check
> -	 */
> -	mddev->flush_bio = NULL;
> -	wake_up(&mddev->sb_wait);
> -
> -	if (bio->bi_iter.bi_size == 0)
> -		/* an empty barrier - all done */
> -		bio_endio(bio);
> -	else {
> -		bio->bi_opf &= ~REQ_PREFLUSH;
> -		md_handle_request(mddev, bio);
> +	if (atomic_dec_and_test(&fi->flush_pending)) {
> +		if (fbio->bi_iter.bi_size == 0) {
> +			/* an empty barrier - all done */
> +			bio_endio(fbio);
> +			fi->fbio = NULL;
> +			atomic_dec(&mddev->flush_io);
> +			wake_up(&fi->flush_queue);
> +			wake_up(&mddev->sb_wait);
> +		} else {
> +			INIT_WORK(&fi->flush_work, submit_flushes);
> +			queue_work(md_wq, &fi->flush_work);
> +		}
>  	}
>  }
> -
> -void md_flush_request(struct mddev *mddev, struct bio *bio)
> -{
> -	spin_lock_irq(&mddev->lock);
> -	wait_event_lock_irq(mddev->sb_wait,
> -			    !mddev->flush_bio,
> -			    mddev->lock);
> -	mddev->flush_bio = bio;
> -	spin_unlock_irq(&mddev->lock);
> -
> -	INIT_WORK(&mddev->flush_work, submit_flushes);
> -	queue_work(md_wq, &mddev->flush_work);
> -}
>  EXPORT_SYMBOL(md_flush_request);
>
>  static inline struct mddev *mddev_get(struct mddev *mddev)
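A side note: the atomic_dec_and_test(&fi->flush_pending) block is now
duplicated verbatim in rdev_end_flush() and at the end of
md_flush_request(). Factoring it out would keep the two copies from
drifting apart; a rough, untested sketch (helper name invented):

	static void flush_pending_put(struct flush_info *fi)
	{
		struct mddev *mddev = fi->mddev;
		struct bio *fbio = fi->fbio;

		if (!atomic_dec_and_test(&fi->flush_pending))
			return;

		if (fbio->bi_iter.bi_size == 0) {
			/* an empty barrier - all done */
			bio_endio(fbio);
			fi->fbio = NULL;
			atomic_dec(&mddev->flush_io);
			wake_up(&fi->flush_queue);
			wake_up(&mddev->sb_wait);
		} else {
			INIT_WORK(&fi->flush_work, submit_flushes);
			queue_work(md_wq, &fi->flush_work);
		}
	}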
> @@ -555,7 +584,6 @@ void mddev_init(struct mddev *mddev)
>  	atomic_set(&mddev->openers, 0);
>  	atomic_set(&mddev->active_io, 0);
>  	spin_lock_init(&mddev->lock);
> -	atomic_set(&mddev->flush_pending, 0);
>  	init_waitqueue_head(&mddev->sb_wait);
>  	init_waitqueue_head(&mddev->recovery_wait);
>  	mddev->reshape_position = MaxSector;
> @@ -5509,6 +5537,27 @@ int md_run(struct mddev *mddev)
>  			goto abort;
>  		}
>  	}
> +	if (mddev->flush_info == NULL) {
> +		int index = 0;
> +		char *p;
> +		struct flush_info *fi;
> +		mddev->flush_info = kzalloc((sizeof(struct flush_info) +
> +				sizeof(struct bio*) * mddev->raid_disks) *
> +				NR_FLUSHS, GFP_KERNEL);
> +		if (!mddev->flush_info) {
> +			err = -ENOMEM;
> +			goto abort;
> +		}
> +
> +		p = (char*)mddev->flush_info;
> +		while (index < NR_FLUSHS) {
> +			fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
> +				+ mddev->raid_disks * sizeof(struct bio*)));
> +			spin_lock_init(&fi->flush_lock);
> +			init_waitqueue_head(&fi->flush_queue);
> +			index++;
> +		}
> +	}
>
>  	spin_lock(&pers_lock);
>  	pers = find_pers(mddev->level, mddev->clevel);
> @@ -5668,6 +5717,9 @@ int md_run(struct mddev *mddev)
>  	return 0;
>
>  abort:
> +	if (mddev->flush_info) {
> +		kfree(mddev->flush_info);
> +	}
>  	if (mddev->bio_set) {
>  		bioset_free(mddev->bio_set);
>  		mddev->bio_set = NULL;
> @@ -5888,6 +5940,10 @@ void md_stop(struct mddev *mddev)
>  	 * This is called from dm-raid
>  	 */
>  	__md_stop(mddev);
> +	if (mddev->flush_info) {
> +		kfree(mddev->flush_info);
> +		mddev->flush_info = NULL;
> +	}
>  	if (mddev->bio_set) {
>  		bioset_free(mddev->bio_set);
>  		mddev->bio_set = NULL;
> @@ -6854,8 +6910,10 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
>
>  static int update_raid_disks(struct mddev *mddev, int raid_disks)
>  {
> -	int rv;
> +	int rv, index;
>  	struct md_rdev *rdev;
> +	struct flush_info *new, *fi;
> +	char *p;
>  	/* change the number of raid disks */
>  	if (mddev->pers->check_reshape == NULL)
>  		return -EINVAL;
> @@ -6884,10 +6942,31 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
>  	else if (mddev->delta_disks > 0)
>  		mddev->reshape_backwards = 0;
>
> +	new = kzalloc((sizeof(struct flush_info) + sizeof(struct bio*) *
> +		raid_disks) * NR_FLUSHS, GFP_KERNEL);
> +	if (!new)
> +		return -ENOMEM;
> +
> +	p = (char*)new;
> +	index = 0;
> +	while (index < NR_FLUSHS) {
> +		fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
> +			+ raid_disks * sizeof(struct bio*)));
> +		spin_lock_init(&fi->flush_lock);
> +		init_waitqueue_head(&fi->flush_queue);
> +		index++;
> +	}
> +
>  	rv = mddev->pers->check_reshape(mddev);
>  	if (rv < 0) {
>  		mddev->delta_disks = 0;
>  		mddev->reshape_backwards = 0;
> +		kfree(new);
> +	} else {
> +		mddev_suspend(mddev);
> +		kfree(mddev->flush_info);
> +		mddev->flush_info = new;
> +		mddev_resume(mddev);
>  	}
>  	return rv;
>  }
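Another small point: the slot addressing arithmetic is open coded three
times now (md_flush_request(), md_run() and update_raid_disks()). A helper
would keep the layout in one place; an untested sketch with an invented
name:

	/* each slot is the fixed header plus one bio pointer per raid disk */
	static inline struct flush_info *flush_info_slot(struct flush_info *base,
							 int raid_disks, int index)
	{
		size_t slot_size = sizeof(struct flush_info) +
				   raid_disks * sizeof(struct bio *);

		return (struct flush_info *)((char *)base + index * slot_size);
	}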
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index fbc925c..24f22f5 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -252,6 +252,17 @@ enum mddev_sb_flags {
>  	MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
>  };
>
> +#define NR_FLUSHS 16
> +struct flush_info {
> +	struct mddev *mddev;
> +	struct work_struct flush_work;
> +	atomic_t flush_pending;
> +	spinlock_t flush_lock;
> +	wait_queue_head_t flush_queue;
> +	struct bio *fbio;
> +	struct bio *bios[0];
> +};
> +
>  struct mddev {
>  	void *private;
>  	struct md_personality *pers;
> @@ -399,7 +410,6 @@ struct mddev {
>  	struct work_struct del_work;	/* used for delayed sysfs removal */
>
>  	/* "lock" protects:
> -	 *   flush_bio transition from NULL to !NULL
>  	 *   rdev superblocks, events
>  	 *   clearing MD_CHANGE_*
>  	 *   in_sync - and related safemode and MD_CHANGE changes
> @@ -457,13 +467,12 @@ struct mddev {
>  					 * metadata and bitmap writes
>  					 */
>
> -	/* Generic flush handling.
> -	 * The last to finish preflush schedules a worker to submit
> -	 * the rest of the request (without the REQ_PREFLUSH flag).
> +	/*
> +	 * Generic flush handling.
>  	 */
> -	struct bio *flush_bio;
> -	atomic_t flush_pending;
> -	struct work_struct flush_work;
> +	struct flush_info *flush_info;
> +	atomic_t flush_io;
> +
>  	struct work_struct event_work;	/* used by dm to report failure event */
>  	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
>  	struct md_cluster_info *cluster_info;
> --
> 2.7.4
>
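One more nit, on the mddev_suspend() change at the top: the two
back-to-back wait_event() calls could be collapsed into one, e.g.
(untested):

	wait_event(mddev->sb_wait,
		   atomic_read(&mddev->active_io) == 0 &&
		   atomic_read(&mddev->flush_io) == 0);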