On Mon, May 21, 2018 at 11:49:54AM +0800, Xiao Ni wrote:
> There is lock contention when many processes send flush bios to an md
> device, e.g. when many LVs are created on one RAID device and mkfs.xfs
> is run on each LV.
>
> Currently flush requests can only be handled sequentially: a new request
> has to wait for mddev->flush_bio to become NULL, otherwise it blocks on
> mddev->lock.
>
> This patch removes mddev->flush_bio and handles flush bios asynchronously.
> I ran a test with the command dbench -s 128 -t 300. This is the result:
>
> =================Without the patch============================
> Operation                Count    AvgLat    MaxLat
> --------------------------------------------------
> Flush                    11165   167.595  5879.560
> Close                   107469     1.391  2231.094
> LockX                      384     0.003     0.019
> Rename                    5944     2.141  1856.001
> ReadX                   208121     0.003     0.074
> WriteX                   98259  1925.402 15204.895
> Unlink                   25198    13.264  3457.268
> UnlockX                    384     0.001     0.009
> FIND_FIRST               47111     0.012     0.076
> SET_FILE_INFORMATION     12966     0.007     0.065
> QUERY_FILE_INFORMATION   27921     0.004     0.085
> QUERY_PATH_INFORMATION  124650     0.005     5.766
> QUERY_FS_INFORMATION     22519     0.003     0.053
> NTCreateX               141086     4.291  2502.812
>
> Throughput 3.7181 MB/sec (sync open) 128 clients 128 procs  max_latency=15204.905 ms
>
> =================With the patch============================
> Operation                Count    AvgLat    MaxLat
> --------------------------------------------------
> Flush                     4500   174.134   406.398
> Close                    48195     0.060   467.062
> LockX                      256     0.003     0.029
> Rename                    2324     0.026     0.360
> ReadX                    78846     0.004     0.504
> WriteX                   66832   562.775  1467.037
> Unlink                    5516     3.665  1141.740
> UnlockX                    256     0.002     0.019
> FIND_FIRST               16428     0.015     0.313
> SET_FILE_INFORMATION      6400     0.009     0.520
> QUERY_FILE_INFORMATION   17865     0.003     0.089
> QUERY_PATH_INFORMATION   47060     0.078   416.299
> QUERY_FS_INFORMATION      7024     0.004     0.032
> NTCreateX                55921     0.854  1141.452
>
> Throughput 11.744 MB/sec (sync open) 128 clients 128 procs  max_latency=1467.041 ms
>
> The test was done on a raid1 array backed by two rotational disks.
>
> V5: V4 is more complicated than the memory-pool version, so revert to
>     the memory-pool version.
>
> V4: Hash the address of fbio to choose a free flush_info.
>
> V3: Shaohua suggested that a mempool is overkill. V3 allocates the
>     memory when the raid device is created and uses a simple bitmap to
>     record which resources are free.
>
>     Also fix a bug from V2: flush_pending must be set to 1 first.
>
> V2: Neil pointed out two problems: a counting error, and the wrong
>     return value when memory allocation fails.
>     1. Counting error:
>        "This isn't safe. It is only safe to call rdev_dec_pending() on
>        rdevs that you previously called atomic_inc(&rdev->nr_pending)
>        on. If an rdev was added to the list between the start and end
>        of the flush, this will do something bad."
>        bio_chain is no longer used; each flush bio now has its own
>        dedicated callback.
>     2. Returning an I/O error when kmalloc fails is wrong; use a
>        mempool, as suggested by Neil.
>     3. Fixed some places pointed out by Guoqing.
>
> Suggested-by: Ming Lei <ming.lei@xxxxxxxxxx>
> Signed-off-by: Xiao Ni <xni@xxxxxxxxxx>

Ok, replaced with this one, thanks!
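For anyone following the changelog above: as I understand the mempool
contract, it is what settles the allocation-failure question from V2.
mempool_alloc() with a blocking mask such as GFP_NOIO cannot return NULL:
it first tries a normal allocation, then falls back to the preallocated
reserve, and if that is empty it sleeps until mempool_free() returns an
element. A minimal sketch of that contract (the demo_* names and struct
are made up for illustration, not part of the patch):

	#include <linux/errno.h>
	#include <linux/mempool.h>
	#include <linux/slab.h>

	struct demo_item { int payload; };

	static void *demo_alloc(gfp_t gfp, void *data)
	{
		return kzalloc(sizeof(struct demo_item), gfp);
	}

	static void demo_free(void *elem, void *data)
	{
		kfree(elem);
	}

	static mempool_t *demo_pool;

	static int demo_init(void)
	{
		/* 8 elements are preallocated up front, so 8 concurrent
		 * users can make progress even under memory pressure */
		demo_pool = mempool_create(8, demo_alloc, demo_free, NULL);
		return demo_pool ? 0 : -ENOMEM;
	}

	static void demo_use(void)
	{
		struct demo_item *it;

		/* GFP_NOIO may block, so this never returns NULL: it dips
		 * into the reserve, then waits for mempool_free() */
		it = mempool_alloc(demo_pool, GFP_NOIO);
		it->payload = 42;
		mempool_free(it, demo_pool); /* may wake a blocked allocator */
	}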
> ---
>  drivers/md/md.c | 158 +++++++++++++++++++++++++++++++++++++-------------------
>  drivers/md/md.h |  22 +++---
>  2 files changed, 120 insertions(+), 60 deletions(-)
>
> diff --git a/drivers/md/md.c b/drivers/md/md.c
> index c208c01..753e0ca 100644
> --- a/drivers/md/md.c
> +++ b/drivers/md/md.c
> @@ -130,6 +130,24 @@ static inline int speed_max(struct mddev *mddev)
>  		mddev->sync_speed_max : sysctl_speed_limit_max;
>  }
>
> +static void * flush_info_alloc(gfp_t gfp_flags, void *data)
> +{
> +	return kzalloc(sizeof(struct flush_info), gfp_flags);
> +}
> +static void flush_info_free(void *flush_info, void *data)
> +{
> +	kfree(flush_info);
> +}
> +
> +static void * flush_bio_alloc(gfp_t gfp_flags, void *data)
> +{
> +	return kzalloc(sizeof(struct flush_bio), gfp_flags);
> +}
> +static void flush_bio_free(void *flush_bio, void *data)
> +{
> +	kfree(flush_bio);
> +}
> +
>  static struct ctl_table_header *raid_table_header;
>
>  static struct ctl_table raid_table[] = {
> @@ -412,30 +430,53 @@ static int md_congested(void *data, int bits)
>  /*
>   * Generic flush handling for md
>   */
> +static void submit_flushes(struct work_struct *ws)
> +{
> +	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
> +	struct mddev *mddev = fi->mddev;
> +	struct bio *bio = fi->bio;
> +
> +	bio->bi_opf &= ~REQ_PREFLUSH;
> +	md_handle_request(mddev, bio);
> +
> +	mempool_free(fi, mddev->flush_pool);
> +}
>
> -static void md_end_flush(struct bio *bio)
> +static void md_end_flush(struct bio *fbio)
>  {
> -	struct md_rdev *rdev = bio->bi_private;
> -	struct mddev *mddev = rdev->mddev;
> +	struct flush_bio *fb = fbio->bi_private;
> +	struct md_rdev *rdev = fb->rdev;
> +	struct flush_info *fi = fb->fi;
> +	struct bio *bio = fi->bio;
> +	struct mddev *mddev = fi->mddev;
>
>  	rdev_dec_pending(rdev, mddev);
>
> -	if (atomic_dec_and_test(&mddev->flush_pending)) {
> -		/* The pre-request flush has finished */
> -		queue_work(md_wq, &mddev->flush_work);
> +	if (atomic_dec_and_test(&fi->flush_pending)) {
> +		if (bio->bi_iter.bi_size == 0)
> +			/* an empty barrier - all done */
> +			bio_endio(bio);
> +		else {
> +			INIT_WORK(&fi->flush_work, submit_flushes);
> +			queue_work(md_wq, &fi->flush_work);
> +		}
>  	}
> -	bio_put(bio);
> -}
>
> -static void md_submit_flush_data(struct work_struct *ws);
> +	mempool_free(fb, mddev->flush_bio_pool);
> +	bio_put(fbio);
> +}
>
> -static void submit_flushes(struct work_struct *ws)
> +void md_flush_request(struct mddev *mddev, struct bio *bio)
>  {
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
>  	struct md_rdev *rdev;
> +	struct flush_info *fi;
> +
> +	fi = mempool_alloc(mddev->flush_pool, GFP_NOIO);
> +
> +	fi->bio = bio;
> +	fi->mddev = mddev;
> +	atomic_set(&fi->flush_pending, 1);
>
> -	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
> -	atomic_set(&mddev->flush_pending, 1);
>  	rcu_read_lock();
>  	rdev_for_each_rcu(rdev, mddev)
>  		if (rdev->raid_disk >= 0 &&
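The atomic_set(&fi->flush_pending, 1) just above is the usual "bias the
counter at one" fan-out trick: the submitter holds one reference of its
own, so the count cannot reach zero while it is still walking the rdev
list, and it drops that bias only after the loop (the final
atomic_dec_and_test() at the end of md_flush_request() below). A
userspace analogue of the same scheme, purely illustrative and not part
of the patch (compile with cc -pthread):

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int pending;

	static void complete_one(void)
	{
		/* whoever moves the counter 1 -> 0 finishes the parent */
		if (atomic_fetch_sub(&pending, 1) == 1)
			printf("all sub-requests done, complete parent\n");
	}

	static void *sub_request(void *arg)
	{
		complete_one();			/* like md_end_flush() */
		return NULL;
	}

	int main(void)
	{
		pthread_t t[2];
		int i;

		atomic_store(&pending, 1);	/* the submitter's bias */
		for (i = 0; i < 2; i++) {
			atomic_fetch_add(&pending, 1);	/* one per child */
			pthread_create(&t[i], NULL, sub_request, NULL);
		}
		complete_one();			/* drop the bias */
		for (i = 0; i < 2; i++)
			pthread_join(t[i], NULL);
		return 0;
	}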
> @@ -445,59 +486,39 @@ static void submit_flushes(struct work_struct *ws)
>  			 * we reclaim rcu_read_lock
>  			 */
>  			struct bio *bi;
> +			struct flush_bio *fb;
>  			atomic_inc(&rdev->nr_pending);
>  			atomic_inc(&rdev->nr_pending);
>  			rcu_read_unlock();
> +
> +			fb = mempool_alloc(mddev->flush_bio_pool, GFP_NOIO);
> +			fb->fi = fi;
> +			fb->rdev = rdev;
> +
>  			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
> -			bi->bi_end_io = md_end_flush;
> -			bi->bi_private = rdev;
>  			bio_set_dev(bi, rdev->bdev);
> +			bi->bi_end_io = md_end_flush;
> +			bi->bi_private = fb;
>  			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
> -			atomic_inc(&mddev->flush_pending);
> +
> +			atomic_inc(&fi->flush_pending);
>  			submit_bio(bi);
> +
>  			rcu_read_lock();
>  			rdev_dec_pending(rdev, mddev);
>  		}
>  	rcu_read_unlock();
> -	if (atomic_dec_and_test(&mddev->flush_pending))
> -		queue_work(md_wq, &mddev->flush_work);
> -}
> -
> -static void md_submit_flush_data(struct work_struct *ws)
> -{
> -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> -	struct bio *bio = mddev->flush_bio;
>
> -	/*
> -	 * must reset flush_bio before calling into md_handle_request to avoid a
> -	 * deadlock, because other bios passed md_handle_request suspend check
> -	 * could wait for this and below md_handle_request could wait for those
> -	 * bios because of suspend check
> -	 */
> -	mddev->flush_bio = NULL;
> -	wake_up(&mddev->sb_wait);
> -
> -	if (bio->bi_iter.bi_size == 0)
> -		/* an empty barrier - all done */
> -		bio_endio(bio);
> -	else {
> -		bio->bi_opf &= ~REQ_PREFLUSH;
> -		md_handle_request(mddev, bio);
> +	if (atomic_dec_and_test(&fi->flush_pending)) {
> +		if (bio->bi_iter.bi_size == 0)
> +			/* an empty barrier - all done */
> +			bio_endio(bio);
> +		else {
> +			INIT_WORK(&fi->flush_work, submit_flushes);
> +			queue_work(md_wq, &fi->flush_work);
> +		}
>  	}
>  }
> -
> -void md_flush_request(struct mddev *mddev, struct bio *bio)
> -{
> -	spin_lock_irq(&mddev->lock);
> -	wait_event_lock_irq(mddev->sb_wait,
> -			    !mddev->flush_bio,
> -			    mddev->lock);
> -	mddev->flush_bio = bio;
> -	spin_unlock_irq(&mddev->lock);
> -
> -	INIT_WORK(&mddev->flush_work, submit_flushes);
> -	queue_work(md_wq, &mddev->flush_work);
> -}
>  EXPORT_SYMBOL(md_flush_request);
>
>  static inline struct mddev *mddev_get(struct mddev *mddev)
> @@ -555,7 +576,6 @@ void mddev_init(struct mddev *mddev)
>  	atomic_set(&mddev->openers, 0);
>  	atomic_set(&mddev->active_io, 0);
>  	spin_lock_init(&mddev->lock);
> -	atomic_set(&mddev->flush_pending, 0);
>  	init_waitqueue_head(&mddev->sb_wait);
>  	init_waitqueue_head(&mddev->recovery_wait);
>  	mddev->reshape_position = MaxSector;
> @@ -5509,6 +5529,22 @@ int md_run(struct mddev *mddev)
>  			goto abort;
>  		}
>  	}
> +	if (mddev->flush_pool == NULL) {
> +		mddev->flush_pool = mempool_create(NR_FLUSH_INFOS, flush_info_alloc,
> +						   flush_info_free, mddev);
> +		if (!mddev->flush_pool) {
> +			err = -ENOMEM;
> +			goto abort;
> +		}
> +	}
> +	if (mddev->flush_bio_pool == NULL) {
> +		mddev->flush_bio_pool = mempool_create(NR_FLUSH_BIOS, flush_bio_alloc,
> +						       flush_bio_free, mddev);
> +		if (!mddev->flush_bio_pool) {
> +			err = -ENOMEM;
> +			goto abort;
> +		}
> +	}
>
>  	spin_lock(&pers_lock);
>  	pers = find_pers(mddev->level, mddev->clevel);
> @@ -5668,6 +5704,14 @@ int md_run(struct mddev *mddev)
>  	return 0;
>
>  abort:
> +	if (mddev->flush_bio_pool) {
> +		mempool_destroy(mddev->flush_bio_pool);
> +		mddev->flush_bio_pool = NULL;
> +	}
> +	if (mddev->flush_pool) {
> +		mempool_destroy(mddev->flush_pool);
> +		mddev->flush_pool = NULL;
> +	}
>  	if (mddev->bio_set) {
>  		bioset_free(mddev->bio_set);
>  		mddev->bio_set = NULL;
> @@ -5888,6 +5932,14 @@ void md_stop(struct mddev *mddev)
>  	 * This is called from dm-raid
>  	 */
>  	__md_stop(mddev);
> +	if (mddev->flush_bio_pool) {
> +		mempool_destroy(mddev->flush_bio_pool);
> +		mddev->flush_bio_pool = NULL;
> +	}
> +	if (mddev->flush_pool) {
> +		mempool_destroy(mddev->flush_pool);
> +		mddev->flush_pool = NULL;
> +	}
>  	if (mddev->bio_set) {
>  		bioset_free(mddev->bio_set);
>  		mddev->bio_set = NULL;
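Putting the md.c pieces together: on a two-disk raid1, fi->flush_pending
walks through these values (the listing shows one possible order; the
three decrements can land in any order, and only the one that reaches
zero completes the parent bio):

	1   atomic_set() in md_flush_request()
	2   atomic_inc() before submit_bio() for member disk 0
	3   atomic_inc() before submit_bio() for member disk 1
	2   md_end_flush() for one member flush bio
	1   md_end_flush() for the other member flush bio
	0   final atomic_dec_and_test() in md_flush_request():
	    an empty barrier is completed with bio_endio(); anything
	    else is handed to submit_flushes(), which strips
	    REQ_PREFLUSH and resubmits via md_handle_request()

Each rdev likewise takes two nr_pending references inside the loop: one
pins the rdev across the rcu_read_unlock()/submit_bio() window and is
dropped through rdev_dec_pending() once rcu_read_lock() is reacquired;
the other is held until md_end_flush() runs for that member bio.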
> diff --git a/drivers/md/md.h b/drivers/md/md.h
> index fbc925c..ffcb1ae 100644
> --- a/drivers/md/md.h
> +++ b/drivers/md/md.h
> @@ -252,6 +252,19 @@ enum mddev_sb_flags {
>  	MD_SB_NEED_REWRITE,	/* metadata write needs to be repeated */
>  };
>
> +#define NR_FLUSH_INFOS 8
> +#define NR_FLUSH_BIOS 64
> +struct flush_info {
> +	struct bio			*bio;
> +	struct mddev			*mddev;
> +	struct work_struct		flush_work;
> +	atomic_t			flush_pending;
> +};
> +struct flush_bio {
> +	struct flush_info *fi;
> +	struct md_rdev *rdev;
> +};
> +
>  struct mddev {
>  	void				*private;
>  	struct md_personality		*pers;
> @@ -457,13 +470,8 @@ struct mddev {
>  					 * metadata and bitmap writes
>  					 */
>
> -	/* Generic flush handling.
> -	 * The last to finish preflush schedules a worker to submit
> -	 * the rest of the request (without the REQ_PREFLUSH flag).
> -	 */
> -	struct bio *flush_bio;
> -	atomic_t flush_pending;
> -	struct work_struct flush_work;
> +	mempool_t *flush_pool;
> +	mempool_t *flush_bio_pool;
>  	struct work_struct event_work;	/* used by dm to report failure event */
>  	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
>  	struct md_cluster_info		*cluster_info;
> --
> 2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html