On 2020/7/20 13:26, Coly Li wrote:
> On 2020/7/20 12:58, Xiao Ni wrote:
>> Now the discard request is split by chunk size, so it takes a long time
>> to finish mkfs on disks which support the discard function. This patch
>> improves the handling of raid10 discard requests. It uses a similar
>> approach to the one raid0 takes (commit 29efc390b).
>>
>> But it's a little more complex than for raid0, because raid10 has
>> different layouts. If raid10 uses the far-offset layout and the discard
>> request is smaller than the stripe size, there are holes when we submit
>> discard bios to the underlying disks.
>>
>> For example: five disks (disk1 - disk5)
>> D01 D02 D03 D04 D05
>> D05 D01 D02 D03 D04
>> D06 D07 D08 D09 D10
>> D10 D06 D07 D08 D09
>> The discard bio just wants to discard from D03 to D10. For disk3, there
>> is a hole between D03 and D08. For disk4, there is a hole between D04
>> and D09. D03 is a chunk, and raid10_write_request can handle one chunk
>> perfectly. So the part that is not aligned with the stripe size is
>> still handled by raid10_write_request.
>>
>> If a reshape is running when a discard bio comes in and the discard bio
>> spans the reshape position, raid10_write_request is responsible for
>> handling this discard bio.
>>
>> I did a test with this patch set.
>> Without the patch:
>> time mkfs.xfs /dev/md0
>> real    4m39.775s
>> user    0m0.000s
>> sys     0m0.298s
>>
>> With the patch:
>> time mkfs.xfs /dev/md0
>> real    0m0.105s
>> user    0m0.000s
>> sys     0m0.007s
>>
>> nvme3n1     259:1  0 477G 0 disk
>> └─nvme3n1p1 259:10 0  50G 0 part
>> nvme4n1     259:2  0 477G 0 disk
>> └─nvme4n1p1 259:11 0  50G 0 part
>> nvme5n1     259:6  0 477G 0 disk
>> └─nvme5n1p1 259:12 0  50G 0 part
>> nvme2n1     259:9  0 477G 0 disk
>> └─nvme2n1p1 259:15 0  50G 0 part
>> nvme0n1     259:13 0 477G 0 disk
>> └─nvme0n1p1 259:14 0  50G 0 part
>>
>> v1:
>> Coly helped to review these patches and gave some suggestions:
>> One bug was found: if a discard bio crosses one stripe but the bio size
>> is bigger than one stripe size, the bio will be NULL after splitting.
>> In this version, it checks whether the discard bio size is bigger than
>> 2 * stripe_size.
>> In raid10_end_discard_request, it's better to check whether
>> R10BIO_Uptodate is already set. This avoids a memory write, which
>> improves performance.
>> Added more comments for calculating addresses.
>>
>> Signed-off-by: Xiao Ni <xni@xxxxxxxxxx>
>
> The code looks good to me, you may add my Reviewed-by: Coly Li
> <colyli@xxxxxxx>
>
> But I still suggest you add a more detailed commit log, to explain
> how the discard range of each component disk is decided. Anyway this is
> just a suggestion, it is up to you.
>
> Thank you for this work.

In raid10_end_discard_request(), among the first 5 lines of local
variable declarations, there are 3 lines starting with improper blanks
(they should be a tab '\t'). You may find this minor issue by
checkpatch.pl I guess.

After fixing this format issue, you still have my Reviewed-by :-)

Coly Li
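To illustrate the "discard must cover at least two stripes" rule the
patch applies, here is a rough userspace sketch; the geometry (5 disks,
512 KiB chunks) and the discard size are made-up numbers, and the code
is not part of the patch:

#include <stdio.h>

int main(void)
{
        /* hypothetical geometry: 5 disks, 512 KiB (1024-sector) chunks */
        unsigned long long chunk_sectors = 1024;
        unsigned long long raid_disks = 5;
        /* one stripe holds one chunk from every member disk */
        unsigned long long stripe_size = chunk_sectors * raid_disks;
        /* hypothetical 8 MiB discard request */
        unsigned long long discard_sectors = 16384;

        if (discard_sectors < stripe_size * 2)
                printf("small discard: fall back to raid10_write_request\n");
        else
                printf("large discard: take the per-disk discard path\n");
        return 0;
}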
>
>> ---
>> drivers/md/raid10.c | 276 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 275 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
>> index 2a7423e..9568c23 100644
>> --- a/drivers/md/raid10.c
>> +++ b/drivers/md/raid10.c
>> @@ -1518,6 +1518,271 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
>>  	raid10_write_request(mddev, bio, r10_bio);
>>  }
>>
>> +static void wait_blocked_dev(struct mddev *mddev, int cnt)
>> +{
>> +	int i;
>> +	struct r10conf *conf = mddev->private;
>> +	struct md_rdev *blocked_rdev = NULL;
>> +
>> +retry_wait:
>> +	rcu_read_lock();
>> +	for (i = 0; i < cnt; i++) {
>> +		struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
>> +		struct md_rdev *rrdev = rcu_dereference(
>> +			conf->mirrors[i].replacement);
>> +		if (rdev == rrdev)
>> +			rrdev = NULL;
>> +		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
>> +			atomic_inc(&rdev->nr_pending);
>> +			blocked_rdev = rdev;
>> +			break;
>> +		}
>> +		if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
>> +			atomic_inc(&rrdev->nr_pending);
>> +			blocked_rdev = rrdev;
>> +			break;
>> +		}
>> +	}
>> +	rcu_read_unlock();
>> +
>> +	if (unlikely(blocked_rdev)) {
>> +		/* Have to wait for this device to get unblocked, then retry */
>> +		allow_barrier(conf);
>> +		raid10_log(conf->mddev, "%s wait rdev %d blocked",
>> +			   __func__, blocked_rdev->raid_disk);
>> +		md_wait_for_blocked_rdev(blocked_rdev, mddev);
>> +		wait_barrier(conf);
>> +		goto retry_wait;
>> +	}
>> +}
>> +
>> +static struct bio *raid10_split_bio(struct r10conf *conf,
>> +		struct bio *bio, sector_t sectors, bool want_first)
>> +{
>> +	struct bio *split;
>> +
>> +	split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split);
>> +	bio_chain(split, bio);
>> +	allow_barrier(conf);
>> +	if (want_first) {
>> +		submit_bio_noacct(bio);
>> +		bio = split;
>> +	} else {
>> +		submit_bio_noacct(split);
>> +	}
>> +	wait_barrier(conf);
>> +
>> +	return bio;
>> +}
>> +
>> +static void raid10_end_discard_request(struct bio *bio)
>> +{
>> +	struct r10bio *r10_bio = bio->bi_private;
>> +	struct r10conf *conf = r10_bio->mddev->private;
>> +	struct md_rdev *rdev = NULL;
>> +	int dev;
>> +	int slot, repl;
>> +
>> +	/*
>> +	 * We don't care about the return value of the discard bio
>> +	 */
>> +	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
>> +		set_bit(R10BIO_Uptodate, &r10_bio->state);
>> +
>> +	dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
>> +	if (repl)
>> +		rdev = conf->mirrors[dev].replacement;
>> +	if (!rdev) {
>> +		smp_rmb();
>> +		repl = 0;
>> +		rdev = conf->mirrors[dev].rdev;
>> +	}
>> +
>> +	if (atomic_dec_and_test(&r10_bio->remaining)) {
>> +		md_write_end(r10_bio->mddev);
>> +		raid_end_bio_io(r10_bio);
>> +	}
>> +
>> +	rdev_dec_pending(rdev, conf->mddev);
>> +}
>> +
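The clone accounting in raid10_end_discard_request() follows the usual
"one reference per outstanding bio plus one for the submitter" pattern.
A minimal userspace sketch of that pattern (plain C with stdatomic, not
kernel code; all names here are made up):

#include <stdatomic.h>
#include <stdio.h>

struct fake_r10bio {
        atomic_int remaining;           /* counts outstanding per-disk bios */
};

static void end_master(struct fake_r10bio *r10)
{
        /* in the kernel this is md_write_end() + raid_end_bio_io() */
        printf("all per-disk discards done, ending the master bio\n");
}

static void one_disk_done(struct fake_r10bio *r10)
{
        /* analogue of atomic_dec_and_test(&r10_bio->remaining) */
        if (atomic_fetch_sub(&r10->remaining, 1) == 1)
                end_master(r10);
}

int main(void)
{
        struct fake_r10bio r10 = { .remaining = 1 };    /* submitter's reference */
        int disks = 4;

        for (int i = 0; i < disks; i++)
                atomic_fetch_add(&r10.remaining, 1);    /* one per cloned bio */
        for (int i = 0; i < disks; i++)
                one_disk_done(&r10);                    /* per-disk end_io */
        one_disk_done(&r10);    /* submitter drops its own reference */
        return 0;
}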
>> +static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
>> +{
>> +	struct r10conf *conf = mddev->private;
>> +	struct geom geo = conf->geo;
>> +	struct r10bio *r10_bio;
>> +
>> +	int disk;
>> +	sector_t chunk;
>> +	int stripe_size, stripe_mask;
>> +
>> +	sector_t bio_start, bio_end;
>> +	sector_t first_stripe_index, last_stripe_index;
>> +	sector_t start_disk_offset;
>> +	unsigned int start_disk_index;
>> +	sector_t end_disk_offset;
>> +	unsigned int end_disk_index;
>> +
>> +	wait_barrier(conf);
>> +
>> +	if (conf->reshape_progress != MaxSector &&
>> +	    ((bio->bi_iter.bi_sector >= conf->reshape_progress) !=
>> +	     conf->mddev->reshape_backwards))
>> +		geo = conf->prev;
>> +
>> +	stripe_size = (1 << geo.chunk_shift) * geo.raid_disks;
>> +	stripe_mask = stripe_size - 1;
>> +
>> +	/* Maybe one discard bio is smaller than the stripe size, or it
>> +	 * crosses a stripe boundary while the discard region is larger than
>> +	 * one stripe size. For the far-offset layout, if the discard region
>> +	 * is not aligned with the stripe size, there are holes when we
>> +	 * submit the discard bio to a member disk. For simplicity, we only
>> +	 * handle discard bios whose discard region is bigger than
>> +	 * stripe_size * 2.
>> +	 */
>> +	if (bio_sectors(bio) < stripe_size * 2)
>> +		goto out;
>> +
>> +	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
>> +	    bio->bi_iter.bi_sector < conf->reshape_progress &&
>> +	    bio->bi_iter.bi_sector + bio_sectors(bio) > conf->reshape_progress)
>> +		goto out;
>> +
>> +	r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
>> +	r10_bio->mddev = mddev;
>> +	r10_bio->state = 0;
>> +	memset(r10_bio->devs, 0,
>> +	       sizeof(r10_bio->devs[0]) * conf->geo.raid_disks);
>> +
>> +	wait_blocked_dev(mddev, geo.raid_disks);
>> +
>> +	/* For the far-offset layout, if the bio is not aligned with the
>> +	 * stripe size, split off the parts that are not aligned with the
>> +	 * stripe size.
>> +	 */
>> +	bio_start = bio->bi_iter.bi_sector;
>> +	bio_end = bio_end_sector(bio);
>> +	if (geo.far_offset && (bio_start & stripe_mask)) {
>> +		sector_t split_size;
>> +		split_size = round_up(bio_start, stripe_size) - bio_start;
>> +		bio = raid10_split_bio(conf, bio, split_size, false);
>> +	}
>> +	if (geo.far_offset && (bio_end & stripe_mask)) {
>> +		sector_t split_size;
>> +		split_size = bio_sectors(bio) - (bio_end & stripe_mask);
>> +		bio = raid10_split_bio(conf, bio, split_size, true);
>> +	}
>> +	r10_bio->master_bio = bio;
>> +
>> +	bio_start = bio->bi_iter.bi_sector;
>> +	bio_end = bio_end_sector(bio);
>> +
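The two splits above trim the unaligned head and tail of the discard so
that only a whole number of stripes is handled by the fast path. A
userspace sketch of the arithmetic, with made-up values (stripe_size
must be a power of two for the mask trick used by the patch to work):

#include <stdio.h>

int main(void)
{
        unsigned long long stripe_size = 8192;  /* hypothetical, power of two */
        unsigned long long bio_start = 1000, bio_end = 50000;

        /* head: from bio_start up to the next stripe boundary */
        unsigned long long head =
                (stripe_size - bio_start % stripe_size) % stripe_size;
        /* tail: from the last stripe boundary up to bio_end */
        unsigned long long tail = bio_end % stripe_size;
        unsigned long long middle = (bio_end - bio_start) - head - tail;

        printf("head split %llu, aligned middle %llu, tail split %llu\n",
               head, middle, tail);
        return 0;
}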
>> +	/* raid10 uses the chunk as the unit to store data. It's similar to
>> +	 * raid0. One stripe contains the chunks from all member disks (one
>> +	 * chunk from each disk at the same HBA address). For layout details,
>> +	 * see the md(4) man page.
>> +	 */
>> +	chunk = bio_start >> geo.chunk_shift;
>> +	chunk *= geo.near_copies;
>> +	first_stripe_index = chunk;
>> +	start_disk_index = sector_div(first_stripe_index, geo.raid_disks);
>> +	if (geo.far_offset)
>> +		first_stripe_index *= geo.far_copies;
>> +	start_disk_offset = (bio_start & geo.chunk_mask) +
>> +		(first_stripe_index << geo.chunk_shift);
>> +
>> +	chunk = bio_end >> geo.chunk_shift;
>> +	chunk *= geo.near_copies;
>> +	last_stripe_index = chunk;
>> +	end_disk_index = sector_div(last_stripe_index, geo.raid_disks);
>> +	if (geo.far_offset)
>> +		last_stripe_index *= geo.far_copies;
>> +	end_disk_offset = (bio_end & geo.chunk_mask) +
>> +		(last_stripe_index << geo.chunk_shift);
>> +
>> +	rcu_read_lock();
>> +	for (disk = 0; disk < geo.raid_disks; disk++) {
>> +		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
>> +		struct md_rdev *rrdev = rcu_dereference(
>> +			conf->mirrors[disk].replacement);
>> +
>> +		r10_bio->devs[disk].bio = NULL;
>> +		r10_bio->devs[disk].repl_bio = NULL;
>> +
>> +		if (rdev && (test_bit(Faulty, &rdev->flags)))
>> +			rdev = NULL;
>> +		if (rrdev && (test_bit(Faulty, &rrdev->flags)))
>> +			rrdev = NULL;
>> +		if (!rdev && !rrdev)
>> +			continue;
>> +
>> +		if (rdev) {
>> +			r10_bio->devs[disk].bio = bio;
>> +			atomic_inc(&rdev->nr_pending);
>> +		}
>> +		if (rrdev) {
>> +			r10_bio->devs[disk].repl_bio = bio;
>> +			atomic_inc(&rrdev->nr_pending);
>> +		}
>> +	}
>> +	rcu_read_unlock();
>> +
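The block above turns a logical sector into a (stripe index, disk
index, per-disk offset) triple. A worked userspace example with a
made-up geometry (5 disks, near_copies=1, 1024-sector chunks; the
far_offset scaling is left out):

#include <stdio.h>

int main(void)
{
        unsigned long long raid_disks = 5, near_copies = 1;
        unsigned long long chunk_shift = 10;            /* 1024-sector chunks */
        unsigned long long chunk_mask = (1ULL << chunk_shift) - 1;
        unsigned long long bio_start = 123456;

        /* same steps as the patch, with / and % instead of sector_div() */
        unsigned long long chunk = (bio_start >> chunk_shift) * near_copies;
        unsigned long long stripe_index = chunk / raid_disks;
        unsigned long long disk_index = chunk % raid_disks;
        unsigned long long disk_offset = (bio_start & chunk_mask) +
                                         (stripe_index << chunk_shift);

        printf("sector %llu -> stripe %llu, disk %llu, disk offset %llu\n",
               bio_start, stripe_index, disk_index, disk_offset);
        return 0;
}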
>> +	atomic_set(&r10_bio->remaining, 1);
>> +	for (disk = 0; disk < geo.raid_disks; disk++) {
>> +		sector_t dev_start, dev_end;
>> +		struct bio *mbio, *rbio = NULL;
>> +		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
>> +		struct md_rdev *rrdev = rcu_dereference(
>> +			conf->mirrors[disk].replacement);
>> +
>> +		/*
>> +		 * Now start to calculate the start and end addresses for
>> +		 * each disk. The space between dev_start and dev_end is the
>> +		 * discard region.
>> +		 */
>> +		if (disk < start_disk_index)
>> +			dev_start = (first_stripe_index + 1) *
>> +				mddev->chunk_sectors;
>> +		else if (disk > start_disk_index)
>> +			dev_start = first_stripe_index * mddev->chunk_sectors;
>> +		else
>> +			dev_start = start_disk_offset;
>> +
>> +		if (disk < end_disk_index)
>> +			dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
>> +		else if (disk > end_disk_index)
>> +			dev_end = last_stripe_index * mddev->chunk_sectors;
>> +		else
>> +			dev_end = end_disk_offset;
>> +
>> +		/* We only handle discard bios whose size is >= stripe_size * 2,
>> +		 * so dev_end > dev_start all the time.
>> +		 */
>> +		if (r10_bio->devs[disk].bio) {
>> +			mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
>> +			mbio->bi_end_io = raid10_end_discard_request;
>> +			mbio->bi_private = r10_bio;
>> +			r10_bio->devs[disk].bio = mbio;
>> +			r10_bio->devs[disk].devnum = disk;
>> +			atomic_inc(&r10_bio->remaining);
>> +			md_submit_discard_bio(mddev, rdev, mbio,
>> +					dev_start, dev_end);
>> +			bio_endio(mbio);
>> +		}
>> +		if (r10_bio->devs[disk].repl_bio) {
>> +			rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
>> +			rbio->bi_end_io = raid10_end_discard_request;
>> +			rbio->bi_private = r10_bio;
>> +			r10_bio->devs[disk].repl_bio = rbio;
>> +			r10_bio->devs[disk].devnum = disk;
>> +			atomic_inc(&r10_bio->remaining);
>> +			md_submit_discard_bio(mddev, rrdev, rbio,
>> +					dev_start, dev_end);
>> +			bio_endio(rbio);
>> +		}
>> +	}
>> +
>> +	if (atomic_dec_and_test(&r10_bio->remaining)) {
>> +		md_write_end(r10_bio->mddev);
>> +		raid_end_bio_io(r10_bio);
>> +	}
>> +
>> +	return 0;
>> +out:
>> +	allow_barrier(conf);
>> +	return -EAGAIN;
>> +}
>> +
>>  static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
>>  {
>>  	struct r10conf *conf = mddev->private;
>> @@ -1532,6 +1797,15 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
>>  	if (!md_write_start(mddev, bio))
>>  		return false;
>>
>> +	/* There are some limitations on handling the discard bio here:
>> +	 * 1st, the discard size must be bigger than stripe_size * 2.
>> +	 * 2nd, if the discard bio spans the reshape position, we use the
>> +	 * old way to handle the discard bio.
>> +	 */
>> +	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
>> +		if (!raid10_handle_discard(mddev, bio))
>> +			return true;
>> +
>>  	/*
>>  	 * If this request crosses a chunk boundary, we need to split
>>  	 * it.
>> @@ -3762,7 +4036,7 @@ static int raid10_run(struct mddev *mddev)
>>  	chunk_size = mddev->chunk_sectors << 9;
>>  	if (mddev->queue) {
>>  		blk_queue_max_discard_sectors(mddev->queue,
>> -					      mddev->chunk_sectors);
>> +					      UINT_MAX);
>>  		blk_queue_max_write_same_sectors(mddev->queue, 0);
>>  		blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
>>  		blk_queue_io_min(mddev->queue, chunk_size);
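To make the per-disk range selection in raid10_handle_discard()
concrete, here is a userspace sketch of the dev_start/dev_end rules
with a made-up geometry (4 disks, 1024-sector chunks, arbitrary start
and end positions); it mirrors the patch's logic but is not part of it:

#include <stdio.h>

int main(void)
{
        unsigned long long chunk_sectors = 1024;
        unsigned long long first_stripe_index = 2, start_disk_index = 1;
        unsigned long long last_stripe_index = 7, end_disk_index = 2;
        unsigned long long start_disk_offset =
                first_stripe_index * chunk_sectors + 100;
        unsigned long long end_disk_offset =
                last_stripe_index * chunk_sectors + 900;

        for (unsigned long long disk = 0; disk < 4; disk++) {
                unsigned long long dev_start, dev_end;

                /* disks before the start disk only join at the next stripe */
                if (disk < start_disk_index)
                        dev_start = (first_stripe_index + 1) * chunk_sectors;
                else if (disk > start_disk_index)
                        dev_start = first_stripe_index * chunk_sectors;
                else
                        dev_start = start_disk_offset;

                /* disks before the end disk still own a chunk in the last stripe */
                if (disk < end_disk_index)
                        dev_end = (last_stripe_index + 1) * chunk_sectors;
                else if (disk > end_disk_index)
                        dev_end = last_stripe_index * chunk_sectors;
                else
                        dev_end = end_disk_offset;

                printf("disk %llu: discard [%llu, %llu)\n",
                       disk, dev_start, dev_end);
        }
        return 0;
}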