----- Original Message -----
> From: "Shaohua Li" <shli@xxxxxxxxxx>
> To: "Xiao Ni" <xni@xxxxxxxxxx>
> Cc: linux-raid@xxxxxxxxxxxxxxx, "ming lei" <ming.lei@xxxxxxxxxx>, ncroxon@xxxxxxxxxx, neilb@xxxxxxxx
> Sent: Monday, April 9, 2018 11:52:20 PM
> Subject: Re: [PATCH V4 1/1] MD: fix lock contention for flush bios
>
> On Fri, Apr 06, 2018 at 02:56:11PM +0800, Xiao Ni wrote:
> > There is lock contention when many processes send flush bios to an md
> > device, e.g. when creating many LVs on one raid device and running
> > mkfs.xfs on each LV.
> >
> > Currently flush requests are handled sequentially: a new flush has to
> > take mddev->lock and wait for mddev->flush_bio to become NULL.
> >
> > This patch removes mddev->flush_bio and handles flush bios
> > asynchronously. I did a test with the command dbench -s 128 -t 300.
> > This is the test result:
> >
> > =================Without the patch============================
> > Operation                Count    AvgLat    MaxLat
> > --------------------------------------------------
> > Flush                   305893     0.648    49.970
> > Close                  3204015     0.013    36.673
> > LockX                    14210     0.003     0.235
> > Rename                  184914     0.038    25.821
> > ReadX                  6844043     0.004     3.561
> > WriteX                 2155550    17.461    84.646
> > Unlink                  882276     0.152    37.705
> > UnlockX                  14210     0.002     0.239
> > FIND_FIRST             1528913     0.015    21.999
> > SET_FILE_INFORMATION    355733     0.007     1.118
> > QUERY_FILE_INFORMATION  689672     0.002     0.376
> > QUERY_PATH_INFORMATION 3956994     0.006    49.292
> > QUERY_FS_INFORMATION    725497     0.004     0.533
> > NTCreateX              4363847     0.057    46.578
> >
> > Throughput 453.259 MB/sec (sync open) 128 clients 128 procs
> > max_latency=84.650 ms
> >
> > =====================With the patch===========================
> > Operation                Count    AvgLat    MaxLat
> > --------------------------------------------------
> > Flush                   390444     0.026     3.360
> > Close                  4089104     0.008    28.907
> > LockX                    18090     0.003     0.146
> > Rename                  235907     0.042    19.649
> > ReadX                  8728080     0.004     1.926
> > WriteX                 2750498    13.734    65.028
> > Unlink                 1126388     0.144    33.684
> > UnlockX                  18090     0.002     0.078
> > FIND_FIRST             1951009     0.015    24.111
> > SET_FILE_INFORMATION    454013     0.008     1.866
> > QUERY_FILE_INFORMATION  879798     0.002     0.746
> > QUERY_PATH_INFORMATION 5050540     0.006    41.920
> > QUERY_FS_INFORMATION    925775     0.004     0.322
> > NTCreateX              5569810     0.043    29.966
> >
> > Throughput 578.195 MB/sec (sync open) 128 clients 128 procs
> > max_latency=65.033 ms
> >
> > V4: use jhash to choose a free flush_info
> >
> > V3:
> > Shaohua suggested that a mempool is overkill. V3 allocates the memory
> > when the raid device is created and uses a simple bitmap to record
> > which resources are free.
> >
> > Fix a bug from v2: it should set flush_pending to 1 at first.
> >
> > V2:
> > Neil pointed out two problems. One is a counting error and the other
> > is the return value when allocating memory fails.
> > 1. counting error problem
> > This isn't safe. It is only safe to call rdev_dec_pending() on rdevs
> > for which you previously called
> >     atomic_inc(&rdev->nr_pending);
> > If an rdev was added to the list between the start and end of the
> > flush, this will do something bad.
> >
> > Now it doesn't use bio_chain. It uses a dedicated callback function
> > for each flush bio.
> > 2. Returning an IO error when kmalloc fails is wrong.
> > I use the mempool suggested by Neil in V2.
> > 3. Fixed some places pointed out by Guoqing.
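To make Neil's point 1 concrete: an end_io handler may only drop a reference
it took itself. In this version the pairing works roughly like this (a
condensed sketch of the code in the patch below, not new code):

	/* submit side: pin the rdev and remember which bio belongs
	 * to which slot (bi->bi_private points at the flush_info) */
	atomic_inc(&rdev->nr_pending);
	fi->bios[rdev->raid_disk] = bi;
	submit_bio(bi);

	/* end_io side: only decrement the rdev whose slot holds this
	 * exact bio, so a device hot-added mid-flush is never
	 * decremented without a matching increment */
	rdev_for_each_rcu(rdev, mddev)
		if (fi->bios[rdev->raid_disk] == bi) {
			fi->bios[rdev->raid_disk] = NULL;
			rdev_dec_pending(rdev, mddev);
			break;
		}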
> >
> > The test is done on one raid1 with two SSDs.
> >
> > Suggested-by: Ming Lei <ming.lei@xxxxxxxxxx>
> > Signed-off-by: Xiao Ni <xni@xxxxxxxxxx>
> > ---
> >  drivers/md/md.c | 189 +++++++++++++++++++++++++++++++++++++++-----------------
> >  drivers/md/md.h |  23 ++---
> >  2 files changed, 150 insertions(+), 62 deletions(-)
> >
> > diff --git a/drivers/md/md.c b/drivers/md/md.c
> > index 254e44e..96722fc 100644
> > --- a/drivers/md/md.c
> > +++ b/drivers/md/md.c
> > @@ -367,6 +367,7 @@ void mddev_suspend(struct mddev *mddev)
> >  	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
> >  	smp_mb__after_atomic();
> >  	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
> > +	wait_event(mddev->sb_wait, atomic_read(&mddev->flush_io) == 0);
> >  	mddev->pers->quiesce(mddev, 1);
> >  	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
> >  	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
> > @@ -412,30 +413,78 @@ static int md_congested(void *data, int bits)
> >  /*
> >   * Generic flush handling for md
> >   */
> > +static void submit_flushes(struct work_struct *ws)
> > +{
> > +	struct flush_info *fi = container_of(ws, struct flush_info, flush_work);
> > +	struct mddev *mddev = fi->mddev;
> > +	struct bio *fbio = fi->fbio;
> > +
> > +	fi->fbio = NULL;
> > +	atomic_dec(&mddev->flush_io);
> > +	wake_up(&fi->flush_queue);
> > +	wake_up(&mddev->sb_wait);
> >
> > -static void md_end_flush(struct bio *bio)
> > +	fbio->bi_opf &= ~REQ_PREFLUSH;
> > +	md_handle_request(mddev, fbio);
> > +}
> > +
> > +static void rdev_end_flush(struct bio *bi)
> >  {
> > -	struct md_rdev *rdev = bio->bi_private;
> > -	struct mddev *mddev = rdev->mddev;
> > +	struct flush_info *fi = bi->bi_private;
> > +	struct mddev *mddev = fi->mddev;
> > +	struct bio *fbio = fi->fbio;
> > +	struct md_rdev *rdev;
> >
> > -	rdev_dec_pending(rdev, mddev);
> > +	rcu_read_lock();
> > +	rdev_for_each_rcu(rdev, mddev)
> > +		if (fi->bios[rdev->raid_disk] == bi) {
> > +			fi->bios[rdev->raid_disk] = NULL;
> > +			rdev_dec_pending(rdev, mddev);
> > +			break;
> > +		}
> > +	rcu_read_unlock();
> >
> > -	if (atomic_dec_and_test(&mddev->flush_pending)) {
> > -		/* The pre-request flush has finished */
> > -		queue_work(md_wq, &mddev->flush_work);
> > +	if (atomic_dec_and_test(&fi->flush_pending)) {
> > +		if (fbio->bi_iter.bi_size == 0) {
> > +			/* an empty barrier - all done */
> > +			bio_endio(fbio);
> > +			fi->fbio = NULL;
> > +			atomic_dec(&mddev->flush_io);
> > +			wake_up(&fi->flush_queue);
> > +			wake_up(&mddev->sb_wait);
> > +		} else {
> > +			INIT_WORK(&fi->flush_work, submit_flushes);
> > +			queue_work(md_wq, &fi->flush_work);
> > +		}
> >  	}
> > -	bio_put(bio);
> > -}
> >
> > -static void md_submit_flush_data(struct work_struct *ws);
> > +	bio_put(bi);
> > +}
> >
> > -static void submit_flushes(struct work_struct *ws)
> > +void md_flush_request(struct mddev *mddev, struct bio *fbio)
> >  {
> > -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> >  	struct md_rdev *rdev;
> > +	struct flush_info *fi;
> > +	char *p = (char*)mddev->flush_info;
> > +	int index;
> > +
> > +	atomic_inc(&mddev->flush_io);
> > +
> > +	index = jhash((void*)&fbio->bi_iter.bi_sector,
> > +		sizeof(sector_t), 0) % NR_FLUSHS;
>
> If this is a pure flush request, the bi_sector is meaningless. So we could
> probably just use fbio to hash.

Ah yes! It's a good idea. I'll re-send the patch soon.
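Something like this, hashing the bio pointer value itself (an untested
sketch):

	/* a pure flush carries no meaningful bi_sector, but the bio
	 * address is unique while the request is in flight, so it
	 * spreads requests across the slots just as well */
	index = jhash(&fbio, sizeof(fbio), 0) % NR_FLUSHS;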
Regards
Xiao

> > +	fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
> > +		+ mddev->raid_disks * sizeof(struct bio*)));
> > +
> > +	spin_lock_irq(&fi->flush_lock);
> > +	wait_event_lock_irq(fi->flush_queue,
> > +			    !fi->fbio,
> > +			    fi->flush_lock);
> > +	fi->fbio = fbio;
> > +	spin_unlock_irq(&fi->flush_lock);
> > +
> > +	fi->mddev = mddev;
> > +	atomic_set(&fi->flush_pending, 1);
> >
> > -	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
> > -	atomic_set(&mddev->flush_pending, 1);
> >  	rcu_read_lock();
> >  	rdev_for_each_rcu(rdev, mddev)
> >  		if (rdev->raid_disk >= 0 &&
> > @@ -448,56 +497,36 @@ static void submit_flushes(struct work_struct *ws)
> >  			atomic_inc(&rdev->nr_pending);
> >  			atomic_inc(&rdev->nr_pending);
> >  			rcu_read_unlock();
> > +
> >  			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
> > -			bi->bi_end_io = md_end_flush;
> > -			bi->bi_private = rdev;
> > +			bi->bi_end_io = rdev_end_flush;
> > +			bi->bi_private = fi;
> >  			bio_set_dev(bi, rdev->bdev);
> >  			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
> > -			atomic_inc(&mddev->flush_pending);
> > +
> > +			fi->bios[rdev->raid_disk] = bi;
> > +			atomic_inc(&fi->flush_pending);
> >  			submit_bio(bi);
> > +
> >  			rcu_read_lock();
> >  			rdev_dec_pending(rdev, mddev);
> >  		}
> >  	rcu_read_unlock();
> > -	if (atomic_dec_and_test(&mddev->flush_pending))
> > -		queue_work(md_wq, &mddev->flush_work);
> > -}
> >
> > -static void md_submit_flush_data(struct work_struct *ws)
> > -{
> > -	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
> > -	struct bio *bio = mddev->flush_bio;
> > -
> > -	/*
> > -	 * must reset flush_bio before calling into md_handle_request to avoid a
> > -	 * deadlock, because other bios passed md_handle_request suspend check
> > -	 * could wait for this and below md_handle_request could wait for those
> > -	 * bios because of suspend check
> > -	 */
> > -	mddev->flush_bio = NULL;
> > -	wake_up(&mddev->sb_wait);
> > -
> > -	if (bio->bi_iter.bi_size == 0)
> > -		/* an empty barrier - all done */
> > -		bio_endio(bio);
> > -	else {
> > -		bio->bi_opf &= ~REQ_PREFLUSH;
> > -		md_handle_request(mddev, bio);
> > +	if (atomic_dec_and_test(&fi->flush_pending)) {
> > +		if (fbio->bi_iter.bi_size == 0) {
> > +			/* an empty barrier - all done */
> > +			bio_endio(fbio);
> > +			fi->fbio = NULL;
> > +			atomic_dec(&mddev->flush_io);
> > +			wake_up(&fi->flush_queue);
> > +			wake_up(&mddev->sb_wait);
> > +		} else {
> > +			INIT_WORK(&fi->flush_work, submit_flushes);
> > +			queue_work(md_wq, &fi->flush_work);
> > +		}
> >  	}
> >  }
> > -
> > -void md_flush_request(struct mddev *mddev, struct bio *bio)
> > -{
> > -	spin_lock_irq(&mddev->lock);
> > -	wait_event_lock_irq(mddev->sb_wait,
> > -			    !mddev->flush_bio,
> > -			    mddev->lock);
> > -	mddev->flush_bio = bio;
> > -	spin_unlock_irq(&mddev->lock);
> > -
> > -	INIT_WORK(&mddev->flush_work, submit_flushes);
> > -	queue_work(md_wq, &mddev->flush_work);
> > -}
> >  EXPORT_SYMBOL(md_flush_request);
> >
> >  static inline struct mddev *mddev_get(struct mddev *mddev)
> > @@ -555,7 +584,6 @@ void mddev_init(struct mddev *mddev)
> >  	atomic_set(&mddev->openers, 0);
> >  	atomic_set(&mddev->active_io, 0);
> >  	spin_lock_init(&mddev->lock);
> > -	atomic_set(&mddev->flush_pending, 0);
> >  	init_waitqueue_head(&mddev->sb_wait);
> >  	init_waitqueue_head(&mddev->recovery_wait);
> >  	mddev->reshape_position = MaxSector;
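Side note: the slot-lookup arithmetic above (base pointer plus index times
the per-slot size) recurs in md_run() and update_raid_disks() below. A small
helper would keep the stride in one place; a hypothetical sketch, the name
nth_flush_info is invented and not part of the patch:

	static inline struct flush_info *nth_flush_info(struct flush_info *base,
							int raid_disks, int index)
	{
		/* each slot is one flush_info plus raid_disks trailing
		 * bio pointers backing the bios[] flexible array, so
		 * the stride is larger than sizeof(struct flush_info) */
		size_t stride = sizeof(struct flush_info) +
				raid_disks * sizeof(struct bio *);

		return (struct flush_info *)((char *)base + index * stride);
	}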
> > @@ -5509,6 +5537,27 @@ int md_run(struct mddev *mddev)
> >  			goto abort;
> >  		}
> >  	}
> > +	if (mddev->flush_info == NULL) {
> > +		int index = 0;
> > +		char *p;
> > +		struct flush_info *fi;
> > +		mddev->flush_info = kzalloc((sizeof(struct flush_info) +
> > +			sizeof(struct bio*) * mddev->raid_disks) *
> > +			NR_FLUSHS, GFP_KERNEL);
> > +		if (!mddev->flush_info) {
> > +			err = -ENOMEM;
> > +			goto abort;
> > +		}
> > +
> > +		p = (char*)mddev->flush_info;
> > +		while (index < NR_FLUSHS) {
> > +			fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
> > +				+ mddev->raid_disks * sizeof(struct bio*)));
> > +			spin_lock_init(&fi->flush_lock);
> > +			init_waitqueue_head(&fi->flush_queue);
> > +			index++;
> > +		}
> > +	}
> >
> >  	spin_lock(&pers_lock);
> >  	pers = find_pers(mddev->level, mddev->clevel);
> > @@ -5668,6 +5717,9 @@ int md_run(struct mddev *mddev)
> >  	return 0;
> >
> >  abort:
> > +	if (mddev->flush_info) {
> > +		kfree(mddev->flush_info);
> > +	}
> >  	if (mddev->bio_set) {
> >  		bioset_free(mddev->bio_set);
> >  		mddev->bio_set = NULL;
> > @@ -5888,6 +5940,10 @@ void md_stop(struct mddev *mddev)
> >  	 * This is called from dm-raid
> >  	 */
> >  	__md_stop(mddev);
> > +	if (mddev->flush_info) {
> > +		kfree(mddev->flush_info);
> > +		mddev->flush_info = NULL;
> > +	}
> >  	if (mddev->bio_set) {
> >  		bioset_free(mddev->bio_set);
> >  		mddev->bio_set = NULL;
> > @@ -6854,8 +6910,10 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
> >
> >  static int update_raid_disks(struct mddev *mddev, int raid_disks)
> >  {
> > -	int rv;
> > +	int rv, index;
> >  	struct md_rdev *rdev;
> > +	struct flush_info *new, *fi;
> > +	char *p;
> >  	/* change the number of raid disks */
> >  	if (mddev->pers->check_reshape == NULL)
> >  		return -EINVAL;
> > @@ -6884,10 +6942,31 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
> >  	else if (mddev->delta_disks > 0)
> >  		mddev->reshape_backwards = 0;
> >
> > +	new = kzalloc((sizeof(struct flush_info) + sizeof(struct bio*) *
> > +		raid_disks) * NR_FLUSHS, GFP_KERNEL);
> > +	if (!new)
> > +		return -ENOMEM;
> > +
> > +	p = (char*)new;
> > +	index = 0;
> > +	while (index < NR_FLUSHS) {
> > +		fi = (struct flush_info *)(p + index * (sizeof(struct flush_info)
> > +			+ raid_disks * sizeof(struct bio*)));
> > +		spin_lock_init(&fi->flush_lock);
> > +		init_waitqueue_head(&fi->flush_queue);
> > +		index++;
> > +	}
> > +
> >  	rv = mddev->pers->check_reshape(mddev);
> >  	if (rv < 0) {
> >  		mddev->delta_disks = 0;
> >  		mddev->reshape_backwards = 0;
> > +		kfree(new);
> > +	} else {
> > +		mddev_suspend(mddev);
> > +		kfree(mddev->flush_info);
> > +		mddev->flush_info = new;
> > +		mddev_resume(mddev);
> >  	}
> >  	return rv;
> >  }
> > diff --git a/drivers/md/md.h b/drivers/md/md.h
> > index fbc925c..24f22f5 100644
> > --- a/drivers/md/md.h
> > +++ b/drivers/md/md.h
> > @@ -252,6 +252,17 @@ enum mddev_sb_flags {
> >  	MD_SB_NEED_REWRITE, /* metadata write needs to be repeated */
> >  };
> >
> > +#define NR_FLUSHS 16
> > +struct flush_info {
> > +	struct mddev		*mddev;
> > +	struct work_struct	flush_work;
> > +	atomic_t		flush_pending;
> > +	spinlock_t		flush_lock;
> > +	wait_queue_head_t	flush_queue;
> > +	struct bio		*fbio;
> > +	struct bio		*bios[0];
> > +};
> > +
> >  struct mddev {
> >  	void *private;
> >  	struct md_personality *pers;
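For reference, this is how the fields of one slot cooperate, condensed from
md_flush_request() and rdev_end_flush() above (a sketch, not new code):

	/* claim: flush_lock + flush_queue serialize slot ownership,
	 * and a non-NULL fbio marks the slot busy */
	spin_lock_irq(&fi->flush_lock);
	wait_event_lock_irq(fi->flush_queue, !fi->fbio, fi->flush_lock);
	fi->fbio = fbio;
	spin_unlock_irq(&fi->flush_lock);

	/* release: once flush_pending drops to zero, clear fbio and
	 * wake the next flush waiting for this slot */
	fi->fbio = NULL;
	wake_up(&fi->flush_queue);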
> > @@ -399,7 +410,6 @@ struct mddev {
> >  	struct work_struct del_work;	/* used for delayed sysfs removal */
> >
> >  	/* "lock" protects:
> > -	 *   flush_bio transition from NULL to !NULL
> >  	 *   rdev superblocks, events
> >  	 *   clearing MD_CHANGE_*
> >  	 *   in_sync - and related safemode and MD_CHANGE changes
> > @@ -457,13 +467,12 @@ struct mddev {
> >  						 * metadata and bitmap writes
> >  						 */
> >
> > -	/* Generic flush handling.
> > -	 * The last to finish preflush schedules a worker to submit
> > -	 * the rest of the request (without the REQ_PREFLUSH flag).
> > +	/*
> > +	 * Generic flush handling.
> >  	 */
> > -	struct bio *flush_bio;
> > -	atomic_t flush_pending;
> > -	struct work_struct flush_work;
> > +	struct flush_info *flush_info;
> > +	atomic_t flush_io;
> > +
> >  	struct work_struct event_work;	/* used by dm to report failure event */
> >  	void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
> >  	struct md_cluster_info *cluster_info;
> > --
> > 2.7.4

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html