On Wed, Feb 21, 2018 at 12:56:48PM +0800, Xiao Ni wrote:
> There is lock contention when many processes send flush bios to an md
> device, e.g. creating many LVs on one RAID device and running mkfs.xfs
> on each LV.
>
> Currently flush requests can only be handled sequentially: a new request
> has to wait for mddev->flush_bio to become NULL, and the wait itself
> takes mddev->lock.
>
> This patch removes mddev->flush_bio and handles flush bios asynchronously.
> I ran a test with the command dbench -s 128 -t 4800. These are the results:
>
> =================Without the patch============================
>  Operation      Count    AvgLat    MaxLat
>  ----------------------------------------
>  NTCreateX     129643     4.792 13249.555
>  Close         101064     0.884 12500.318
>  Rename          5498     2.364  8250.734
>  Unlink         21305     6.728  9749.328
>  Qpathinfo     118012     0.041   999.884
>  Qfileinfo      26676     0.004     0.261
>  Qfsinfo        19712     0.009     0.490
>  Sfileinfo      12247     0.014     0.614
>  Find           43408     0.025     0.528
>  WriteX         90174  6622.484 66249.763
>  ReadX         194811     0.022  2748.778
>  LockX            392     0.007     0.041
>  UnlockX          392     0.003     0.019
>  Flush           9970  1694.938 37749.263
>
> Throughput 1.05396 MB/sec (sync open)  128 clients  128 procs  max_latency=66249.775 ms
>
> =====================With the patch===========================
>  Operation      Count    AvgLat    MaxLat
>  ----------------------------------------
>  NTCreateX     903706     3.345  4500.722
>  Close         667882     0.128  4500.736
>  Rename         38387     0.566   750.312
>  Unlink        178731    13.415  5249.599
>  Qpathinfo     824087     1.129  4500.394
>  Qfileinfo     145055     0.004     1.010
>  Qfsinfo       147967     0.011     1.565
>  Sfileinfo      74359     0.023     1.439
>  Find          315728     0.096  2748.048
>  WriteX        447580  1303.313  4000.197
>  ReadX        1420065     0.010     3.587
>  LockX           3012     0.006     0.415
>  UnlockX         3012     0.004     0.067
>  Flush          62019   398.955  1251.283
>
> Throughput 5.8709 MB/sec (sync open)  128 clients  128 procs  max_latency=5249.611 ms
>
> V2:
> Neil pointed out two problems: a counting error, and the return value
> when memory allocation fails.
> 1. Counting error:
> This isn't safe. It is only safe to call rdev_dec_pending() on rdevs
> on which you previously called
>     atomic_inc(&rdev->nr_pending);
> If an rdev was added to the list between the start and end of the flush,
> this will do something bad.
>
> The patch no longer uses bio_chain; it uses a dedicated callback function
> for each flush bio.
> 2. Returning an I/O error when kmalloc fails is wrong.
> In V2 I use a mempool, as suggested by Neil.
> 3. Fixed some places pointed out by Guoqing.
>
> The test was done on one raid10 array with 4 SSDs.
>
> Suggested-by: Ming Lei <ming.lei@xxxxxxxxxx>
> Signed-off-by: Xiao Ni <xni@xxxxxxxxxx>
> ---
>  drivers/md/md.c | 152 ++++++++++++++++++++++++++++++++++++--------------------
>  drivers/md/md.h |  22 +++++---
>  2 files changed, 112 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index 0a9ed51..60c2c17 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -130,6 +130,24 @@ static inline int speed_max(struct mddev *mddev)
>          mddev->sync_speed_max : sysctl_speed_limit_max;
>  }
>
> +static void * flush_info_alloc(gfp_t gfp_flags, void *data)
> +{
> +        return kzalloc(sizeof(struct flush_info), gfp_flags);
> +}
> +static void flush_info_free(void *flush_info, void *data)
> +{
> +        kfree(flush_info);
> +}
> +
> +static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
> +{
> +        return kzalloc(sizeof(struct flush_bio), gfp_flags);
> +}
> +static void flush_bio_free(void *flush_bio, void *data)
> +{
> +        kfree(flush_bio);
> +}
> +

A mempool sounds like overkill here. You could simply allocate a static
array of the current data structure and let each flush bio hash to one
entry of that array. I bet that would only change a few lines of the
current code and still fix the lock contention.
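Roughly, I mean something like the sketch below. It is untested and only
meant to show the shape of the idea; the slot count, the use of hash_ptr()
on the bio pointer and the reuse of sb_wait for the per-slot wait are all
arbitrary choices made up for illustration, not anything taken from this
patch (struct flush_info is the one the patch already defines):

#include <linux/hash.h>

#define NR_FLUSH_SLOTS 8		/* arbitrary small power of two */

/* in struct mddev, instead of the two mempools: */
	struct flush_info	flush_infos[NR_FLUSH_SLOTS];

void md_flush_request(struct mddev *mddev, struct bio *bio)
{
	/* every flush bio hashes to one fixed slot of the static array */
	struct flush_info *fi =
		&mddev->flush_infos[hash_ptr(bio, ilog2(NR_FLUSH_SLOTS))];

	/*
	 * Claim the slot.  We only wait for an earlier flush that hashed
	 * to the same slot, not for every flush on the mddev.  Whoever
	 * completes a flush sets fi->bio back to NULL and wakes sb_wait.
	 */
	wait_event(mddev->sb_wait, cmpxchg(&fi->bio, NULL, bio) == NULL);

	fi->mddev = mddev;
	atomic_set(&fi->flush_pending, 1);
	/* ... then submit the per-rdev PREFLUSH bios as the patch does ... */
}

With a handful of slots, flushes from different LVs rarely serialise
against each other, and there is no allocation in the flush path at all.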

>  static struct ctl_table_header *raid_table_header;
>
>  static struct ctl_table raid_table[] = {
> @@ -412,30 +430,53 @@ static int md_congested(void *data, int bits)
>  /*
>   * Generic flush handling for md
>   */
> +static void submit_flushes(struct work_struct *ws)
> +{
> +        struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
> +        struct mddev *mddev = fi->mddev;
> +        struct bio *bio = fi->bio;
> +
> +        bio->bi_opf &= ~REQ_PREFLUSH;
> +        md_handle_request(mddev, bio);
> +
> +        mempool_free(fi, mddev->flush_pool);
> +}
>
> -static void md_end_flush(struct bio *bio)
> +static void md_end_flush(struct bio *fbio)
>  {
> -        struct md_rdev *rdev = bio->bi_private;
> -        struct mddev *mddev = rdev->mddev;
> +        struct flush_bio *fb = fbio->bi_private;
> +        struct md_rdev *rdev = fb->rdev;
> +        struct flush_info *fi = fb->fi;
> +        struct bio *bio = fi->bio;
> +        struct mddev *mddev = fi->mddev;
>
>          rdev_dec_pending(rdev, mddev);
>
> -        if (atomic_dec_and_test(&mddev->flush_pending)) {
> -                /* The pre-request flush has finished */
> -                queue_work(md_wq, &mddev->flush_work);
> +        if (atomic_dec_and_test(&fi->flush_pending)) {
> +                if (bio->bi_iter.bi_size == 0)
> +                        /* an empty barrier - all done */
> +                        bio_endio(bio);
> +                else {
> +                        INIT_WORK(&fi->flush_work, submit_flushes);
> +                        queue_work(md_wq, &fi->flush_work);
> +                }
>          }
> -        bio_put(bio);
> -}
>
> -static void md_submit_flush_data(struct work_struct *ws);
> +        mempool_free(fb, mddev->flush_bio_pool);
> +        bio_put(fbio);
> +}
>
> -static void submit_flushes(struct work_struct *ws)
> +void md_flush_request(struct mddev *mddev, struct bio *bio)
>  {
> -        struct mddev *mddev = container_of(ws, struct mddev, flush_work);
>          struct md_rdev *rdev;
> +        struct flush_info *fi;
> +
> +        fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
> +
> +        fi->bio = bio;
> +        fi->mddev = mddev;
> +        atomic_set(&fi->flush_pending, 0);
>
> -        INIT_WORK(&mddev->flush_work, md_submit_flush_data);
> -        atomic_set(&mddev->flush_pending, 1);
>          rcu_read_lock();
>          rdev_for_each_rcu(rdev, mddev)
>                  if (rdev->raid_disk >= 0 &&
> @@ -445,58 +486,28 @@ static void submit_flushes(struct work_struct *ws)
>                           * we reclaim rcu_read_lock
>                           */
>                          struct bio *bi;
> +                        struct flush_bio *fb;
>                          atomic_inc(&rdev->nr_pending);
>                          atomic_inc(&rdev->nr_pending);
>                          rcu_read_unlock();
> +
> +                        fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
> +                        fb->fi = fi;
> +                        fb->rdev = rdev;
> +
>                          bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
> -                        bi->bi_end_io = md_end_flush;
> -                        bi->bi_private = rdev;
>                          bio_set_dev(bi, rdev->bdev);
> +                        bi->bi_end_io = md_end_flush;
> +                        bi->bi_private = fb;
>                          bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
> -                        atomic_inc(&mddev->flush_pending);
> +
> +                        atomic_inc(&fi->flush_pending);
>                          submit_bio(bi);

You initialise flush_pending to 0, which is wrong: after submit_bio(),
md_end_flush() could be called immediately, before the other rdevs have
had a chance to go through this loop.
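The usual way to close that race is the bias the old code already used on
mddev->flush_pending: start the per-flush counter at 1 before submitting
anything, and drop that extra reference only after the loop. Something
along these lines (just a sketch of the pattern, not an actual follow-up
patch; the completion block simply mirrors md_end_flush() above):

	fi->bio = bio;
	fi->mddev = mddev;
	atomic_set(&fi->flush_pending, 1);	/* bias held while iterating the rdevs */

	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev) {
		/* ... allocate fb and bi as above, then ... */
		atomic_inc(&fi->flush_pending);
		submit_bio(bi);
		/* ... */
	}
	rcu_read_unlock();

	/* drop the bias; if every md_end_flush() already ran, finish here */
	if (atomic_dec_and_test(&fi->flush_pending)) {
		if (bio->bi_iter.bi_size == 0)
			/* an empty barrier - all done */
			bio_endio(bio);
		else {
			INIT_WORK(&fi->flush_work, submit_flushes);
			queue_work(md_wq, &fi->flush_work);
		}
	}

That way md_end_flush() can never see the counter reach zero before
md_flush_request() has finished looping over the rdevs.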
Thanks,
Shaohua

> +
>                          rcu_read_lock();
>                          rdev_dec_pending(rdev, mddev);
>                  }
>          rcu_read_unlock();
> -        if (atomic_dec_and_test(&mddev->flush_pending))
> -                queue_work(md_wq, &mddev->flush_work);
> -}
> -
> -static void md_submit_flush_data(struct work_struct *ws)
> -{
> -        struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> -        struct bio *bio = mddev->flush_bio;
> -
> -        /*
> -         * must reset flush_bio before calling into md_handle_request to avoid a
> -         * deadlock, because other bios passed md_handle_request suspend check
> -         * could wait for this and below md_handle_request could wait for those
> -         * bios because of suspend check
> -         */
> -        mddev->flush_bio = NULL;
> -        wake_up(&mddev->sb_wait);
> -
> -        if (bio->bi_iter.bi_size == 0)
> -                /* an empty barrier - all done */
> -                bio_endio(bio);
> -        else {
> -                bio->bi_opf &= ~REQ_PREFLUSH;
> -                md_handle_request(mddev, bio);
> -        }
> -}
> -
> -void md_flush_request(struct mddev *mddev, struct bio *bio)
> -{
> -        spin_lock_irq(&mddev->lock);
> -        wait_event_lock_irq(mddev->sb_wait,
> -                            !mddev->flush_bio,
> -                            mddev->lock);
> -        mddev->flush_bio = bio;
> -        spin_unlock_irq(&mddev->lock);
> -
> -        INIT_WORK(&mddev->flush_work, submit_flushes);
> -        queue_work(md_wq, &mddev->flush_work);
>  }
>  EXPORT_SYMBOL(md_flush_request);
>
> @@ -555,7 +566,6 @@ void mddev_init(struct mddev *mddev)
>          atomic_set(&mddev->openers, 0);
>          atomic_set(&mddev->active_io, 0);
>          spin_lock_init(&mddev->lock);
> -        atomic_set(&mddev->flush_pending, 0);
>          init_waitqueue_head(&mddev->sb_wait);
>          init_waitqueue_head(&mddev->recovery_wait);
>          mddev->reshape_position = MaxSector;
> @@ -5501,6 +5511,22 @@ int md_run(struct mddev *mddev)
>                          goto abort;
>                  }
>          }
> +        if (mddev->flush_pool == NULL) {
> +                mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
> +                                                   flush_info_free, mddev);
> +                if (!mddev->flush_pool) {
> +                        err = -ENOMEM;
> +                        goto abort;
> +                }
> +        }
> +        if (mddev->flush_bio_pool == NULL) {
> +                mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
> +                                                       flush_bio_free, mddev);
> +                if (!mddev->flush_bio_pool) {
> +                        err = -ENOMEM;
> +                        goto abort;
> +                }
> +        }
>
>          spin_lock(&pers_lock);
>          pers = find_pers(mddev->level, mddev->clevel);
> @@ -5665,6 +5691,14 @@ int md_run(struct mddev *mddev)
>          return 0;
>
>  abort:
> +        if (mddev->flush_bio_pool) {
> +                mempool_destroy(mddev->flush_bio_pool);
> +                mddev->flush_bio_pool = NULL;
> +        }
> +        if (mddev->flush_pool){
> +                mempool_destroy(mddev->flush_pool);
> +                mddev->flush_pool = NULL;
> +        }
>          if (mddev->bio_set) {
>                  bioset_free(mddev->bio_set);
>                  mddev->bio_set = NULL;
> @@ -5867,6 +5901,14 @@ void md_stop(struct mddev *mddev)
>           * This is called from dm-raid
>           */
>          __md_stop(mddev);
> +        if (mddev->flush_bio_pool) {
> +                mempool_destroy(mddev->flush_bio_pool);
> +                mddev->flush_bio_pool = NULL;
> +        }
> +        if (mddev->flush_pool) {
> +                mempool_destroy(mddev->flush_pool);
> +                mddev->flush_pool = NULL;
> +        }
>          if (mddev->bio_set) {
>                  bioset_free(mddev->bio_set);
>                  mddev->bio_set = NULL;
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index 7d6bcf0..1e40824 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -252,6 +252,19 @@ enum mddev_sb_flags {
>          MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
>  };
>
> +#define NR_FLUSH_INFOS 8
> +#define NR_FLUSH_BIOS 64
> +struct flush_info {
> +        struct bio              *bio;
> +        struct mddev            *mddev;
> +        struct work_struct      flush_work;
> +        atomic_t                flush_pending;
> +};
> +struct flush_bio {
> +        struct flush_info *fi;
> +        struct md_rdev *rdev;
> +};
> +
>  struct mddev {
>          void                            *private;
>          struct md_personality           *pers;
> @@ -457,13 +470,8 @@ struct mddev {
>                                           * metadata and bitmap writes
>                                           */
>
> -        /* Generic flush handling.
> -         * The last to finish preflush schedules a worker to submit
> -         * the rest of the request (without the REQ_PREFLUSH flag).
> -         */
> -        struct bio *flush_bio;
> -        atomic_t flush_pending;
> -        struct work_struct flush_work;
> +        mempool_t *flush_pool;
> +        mempool_t *flush_bio_pool;
>          struct work_struct event_work;  /* used by dm to report failure event */
>          void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
>          struct md_cluster_info *cluster_info;
> --
> 2.7.4
>