Re: Re: [PATCH 4/4] raid1: Rewrite the implementation of iobarrier.

majianpeng <majianpeng@xxxxxxxxx> · Thu, 31 Oct 2013 11:20:55 +0800



>On Tue, 29 Oct 2013 09:30:14 +0800 majianpeng <majianpeng@xxxxxxxxx> wrote:
>
>
>Nearly there!!  Just a few more details.  See below.
>
>
>> Signed-off-by: Jianpeng Ma <majianpeng@xxxxxxxxx>
>> ---
>>  drivers/md/raid1.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++------
>>  drivers/md/raid1.h |  14 ++++++
>>  2 files changed, 124 insertions(+), 12 deletions(-)
>> 
>> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
>> index b4a6dcd..5b311c0 100644
>> --- a/drivers/md/raid1.c
>> +++ b/drivers/md/raid1.c
>> @@ -66,7 +66,8 @@
>>   */
>>  static int max_queued_requests = 1024;
>>  
>> -static void allow_barrier(struct r1conf *conf);
>> +static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
>> +				sector_t bi_sector);
>>  static void lower_barrier(struct r1conf *conf);
>>  
>>  static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
>> @@ -227,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
>>  	struct bio *bio = r1_bio->master_bio;
>>  	int done;
>>  	struct r1conf *conf = r1_bio->mddev->private;
>> +	sector_t start_next_window = r1_bio->start_next_window;
>> +	sector_t bi_sector = bio->bi_sector;
>
>This should  be r1_bio->sector, not bio->bi_sector.
>They are often the same but if multiple r1_bios are needed for some reason
>(e.g. bad blocks) they may not be.
>
No, allow_barrier() only for bio not for r1bio.It only do when bio->bi_phys_segments == 0.
If this value is r1_bio->sector, the value may has different value as you said.
So in allow_barrier():
       if (start_next_window) {
                if (start_next_window == conf->start_next_window) {
                        if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
                                <= bi_sector)
                                conf->next_window_requests--;
                        else 
                                conf->current_window_requests--;
                } else 
It will has differnt result.

>>  
>>  	if (bio->bi_phys_segments) {
>>  		unsigned long flags;
>> @@ -234,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
>>  		bio->bi_phys_segments--;
>>  		done = (bio->bi_phys_segments == 0);
>>  		spin_unlock_irqrestore(&conf->device_lock, flags);
>> +		/*
>> +		 * make_request() might be waiting for
>> +		 * bi_phys_segments to decrease
>> +		 */
>> +		wake_up(&conf->wait_barrier);
>>  	} else
>>  		done = 1;
>>  
>> @@ -245,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
>>  		 * Wake up any possible resync thread that waits for the device
>>  		 * to go idle.
>>  		 */
>> -		allow_barrier(conf);
>> +		allow_barrier(conf, start_next_window, bi_sector);
>>  	}
>>  }
>>  
>> @@ -827,10 +835,18 @@ static void raise_barrier(struct r1conf *conf)
>>  	/* block any new IO from starting */
>>  	conf->barrier++;
>>  
>> -	/* Now wait for all pending IO to complete */
>> +	/* For those conditions we must wait:
>> +	 * A:while the array is in frozen state
>> +	 * B:while barrier >= RESYNC_DEPTH, meaning resync reach
>> +	 *   the max count which allowed.
>> +	 * C:next_resync + RESYNC_SECTORS > start_next_window, meaning
>> +	 *   next resync will reach to window which normal bios are handling.
>> +	 */
>>  	wait_event_lock_irq(conf->wait_barrier,
>>  			    !conf->array_frozen &&
>> -			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
>> +			    conf->barrier < RESYNC_DEPTH &&
>> +			    (conf->start_next_window >=
>> +			     conf->next_resync + RESYNC_SECTORS),
>>  			    conf->resync_lock);
>
>You've removed the test on conf->nr_pending here, which I think is correct.
>It counts 'read' requests as well.  Testing start_next_window serves the same
>purpose as it is increased whenever current_window_requests reaches zero.
>
In func wait_barrier():
        if (need_to_wait_for_sync(conf, bio)) {
              [snip]
        } else if (bio_data_dir(bio) == WRITE) {
                if (conf->next_resync + NEXT_NORMALIO_DISTANCE
                                <= bio->bi_sector) {
                        if (conf->start_next_window == MaxSector)
                                conf->start_next_window =
                                        conf->next_resync +
                                                NEXT_NORMALIO_DISTANCE;

                        if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
                                <= bio->bi_sector)
                                conf->next_window_requests++;
                        else
                                conf->current_window_requests++;
                }
                if (bio->bi_sector >= conf->start_next_window)
                        sector = conf->start_next_window;
        }
start_next_window only for bio_data_dir(bio) == WRITE. So resync can't wait read io to complete.

>However you having modified as similar test on nr_pending in wait_barrier().
>That worries me a bit.  Should that be changed to a test on start_next_window
>to match the above change?
For this condition, i only copy it.For this condition, i'm not know clearly.
Can you give me more details about this?
>
>
>>  
>>  	spin_unlock_irq(&conf->resync_lock);
>> @@ -846,10 +862,33 @@ static void lower_barrier(struct r1conf *conf)
>>  	wake_up(&conf->wait_barrier);
>>  }
>>  
>> -static void wait_barrier(struct r1conf *conf)
>> +static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
>> +{
>> +	bool wait = false;
>> +
>> +	if (conf->array_frozen || !bio)
>> +		wait = true;
>> +	else if (conf->barrier && bio_data_dir(bio) == WRITE) {
>> +		if (conf->next_resync < RESYNC_WINDOW_SECTORS)
>> +			wait = true;
>> +		else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
>> +				>= bio_end_sector(bio)) ||
>> +			 (conf->next_resync + NEXT_NORMALIO_DISTANCE
>> +				<= bio->bi_sector))
>> +			wait = false;
>> +		else
>> +			wait = true;
>> +	}
>> +
>> +	return wait;
>> +}
>> +
>> +static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
>>  {
>> +	sector_t sector = 0;
>> +
>>  	spin_lock_irq(&conf->resync_lock);
>> -	if (conf->barrier) {
>> +	if (need_to_wait_for_sync(conf, bio)) {
>>  		conf->nr_waiting++;
>>  		/* Wait for the barrier to drop.
>>  		 * However if there are already pending
>> @@ -868,16 +907,57 @@ static void wait_barrier(struct r1conf *conf)
>>  				     !bio_list_empty(current->bio_list))),
>>  				    conf->resync_lock);
>>  		conf->nr_waiting--;
>> +	} else if (bio_data_dir(bio) == WRITE) {
>> +		if (conf->next_resync + NEXT_NORMALIO_DISTANCE
>> +				<= bio->bi_sector) {
>> +			if (conf->start_next_window == MaxSector)
>> +				conf->start_next_window =
>> +					conf->next_resync +
>> +						NEXT_NORMALIO_DISTANCE;
>> +
>> +			if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
>> +				<= bio->bi_sector)
>> +				conf->next_window_requests++;
>> +			else
>> +				conf->current_window_requests++;
>> +		}
>> +		if (bio->bi_sector >= conf->start_next_window)
>> +			sector = conf->start_next_window;
>
>You aren't setting 'sector' if we needed to wait.  I don't think that is
>correct, is it?
>
Yes.How about those code:
spin_lock_irq(&conf->resync_lock);
retry_check:
        if (need_to_wait_for_sync(conf, bio)) {
                conf->nr_waiting++;
                /* Wait for the barrier to drop.
                 * However if there are already pending
                 * requests (preventing the barrier from
                 * rising completely), and the
                 * pre-process bio queue isn't empty,
                 * then don't wait, as we need to empty
                 * that queue to get the nr_pending
                 * count down.
                 */
                wait_event_lock_irq(conf->wait_barrier,
                                    !conf->array_frozen &&
                                    (!conf->barrier ||
                                    (conf->nr_pending &&
                                     current->bio_list &&
                                     !bio_list_empty(current->bio_list))),
                                    conf->resync_lock);
                conf->nr_waiting--;
				goto retry_check;
>
>>  	}
>> +
>>  	conf->nr_pending++;
>>  	spin_unlock_irq(&conf->resync_lock);
>> +	return sector;
>>  }
>>  
>> -static void allow_barrier(struct r1conf *conf)
>> +static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
>> +				sector_t bi_sector)
>>  {
>>  	unsigned long flags;
>> +
>>  	spin_lock_irqsave(&conf->resync_lock, flags);
>>  	conf->nr_pending--;
>> +	if (start_next_window) {
>> +		if (start_next_window == conf->start_next_window) {
>> +			if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
>> +				<= bi_sector)
>> +				conf->next_window_requests--;
>> +			else
>> +				conf->current_window_requests--;
>> +		} else
>> +			conf->current_window_requests--;
>> +
>> +		if (!conf->current_window_requests) {
>> +			if (conf->next_window_requests) {
>> +				conf->current_window_requests =
>> +					conf->next_window_requests;
>> +				conf->next_window_requests = 0;
>> +				conf->start_next_window +=
>> +					NEXT_NORMALIO_DISTANCE;
>> +			} else
>> +				conf->start_next_window = MaxSector;
>> +		}
>> +	}
>>  	spin_unlock_irqrestore(&conf->resync_lock, flags);
>>  	wake_up(&conf->wait_barrier);
>>  }
>> @@ -1012,6 +1092,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
>>  	int first_clone;
>>  	int sectors_handled;
>>  	int max_sectors;
>> +	sector_t start_next_window;
>>  
>>  	/*
>>  	 * Register the new request and wait if the reconstruction
>> @@ -1041,7 +1122,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
>>  		finish_wait(&conf->wait_barrier, &w);
>>  	}
>>  
>> -	wait_barrier(conf);
>> +	start_next_window = wait_barrier(conf, bio);
>>  
>>  	bitmap = mddev->bitmap;
>>  
>> @@ -1162,6 +1243,7 @@ read_again:
>>  
>>  	disks = conf->raid_disks * 2;
>>   retry_write:
>> +	r1_bio->start_next_window = start_next_window;
>>  	blocked_rdev = NULL;
>>  	rcu_read_lock();
>>  	max_sectors = r1_bio->sectors;
>> @@ -1230,14 +1312,24 @@ read_again:
>>  	if (unlikely(blocked_rdev)) {
>>  		/* Wait for this device to become unblocked */
>>  		int j;
>> +		sector_t old = start_next_window;
>>  
>>  		for (j = 0; j < i; j++)
>>  			if (r1_bio->bios[j])
>>  				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
>>  		r1_bio->state = 0;
>> -		allow_barrier(conf);
>> +		allow_barrier(conf, start_next_window, bio->bi_sector);
>
>I think this should be r1_bio->sector, not bio->bi_sector, for the same
>reason as earlier.
>
The exaplaination is the same at above.

Thanks!
Jianpeng Ma
>
>>  		md_wait_for_blocked_rdev(blocked_rdev, mddev);
>> -		wait_barrier(conf);
>> +		start_next_window = wait_barrier(conf, bio);
>> +		/*
>> +		 * We must make sure the multi r1bios of bio have
>> +		 * the same value of bi_phys_segments
>> +		 */
>> +		if (bio->bi_phys_segments && old &&
>> +			old != start_next_window)
>> +			/*wait the former r1bio(s) completed*/
>> +			wait_event(conf->wait_barrier,
>> +					bio->bi_phys_segments == 1);
>>  		goto retry_write;
>>  	}
>>  
>> @@ -1437,11 +1529,14 @@ static void print_conf(struct r1conf *conf)
>>  
>>  static void close_sync(struct r1conf *conf)
>>  {
>> -	wait_barrier(conf);
>> -	allow_barrier(conf);
>> +	wait_barrier(conf, NULL);
>> +	allow_barrier(conf, 0, 0);
>>  
>>  	mempool_destroy(conf->r1buf_pool);
>>  	conf->r1buf_pool = NULL;
>> +
>> +	conf->next_resync = 0;
>> +	conf->start_next_window = MaxSector;
>>  }
>>  
>>  static int raid1_spare_active(struct mddev *mddev)
>> @@ -2713,6 +2808,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
>>  	conf->pending_count = 0;
>>  	conf->recovery_disabled = mddev->recovery_disabled - 1;
>>  
>> +	conf->start_next_window = MaxSector;
>> +	conf->current_window_requests = conf->next_window_requests = 0;
>> +
>>  	err = -EIO;
>>  	for (i = 0; i < conf->raid_disks * 2; i++) {
>>  
>> diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
>> index 331a98a..07425a1 100644
>> --- a/drivers/md/raid1.h
>> +++ b/drivers/md/raid1.h
>> @@ -41,6 +41,19 @@ struct r1conf {
>>  	 */
>>  	sector_t		next_resync;
>>  
>> +	/*when raid1 start resync,we divide raid into four partitions
>> +	 * |---------|--------------|---------------------|-------------|
>> +	 *        next_resync   start_next_window        Pc
>> +	 * Now start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
>> +	 * Pc = start_next_window + NEXT_NORMALIO_DISTANCE
>> +	 * current_window_requests means the count of normalIO between
>> +	 * start_next_window and Pc.
>> +	 * next_window_requests means the count of nornalIO after Pc.
>> +	 * */
>> +	sector_t		start_next_window;
>> +	int			current_window_requests;
>> +	int			next_window_requests;
>> +
>>  	spinlock_t		device_lock;
>>  
>>  	/* list of 'struct r1bio' that need to be processed by raid1d,
>> @@ -112,6 +125,7 @@ struct r1bio {
>>  						 * in this BehindIO request
>>  						 */
>>  	sector_t		sector;
>> +	sector_t		start_next_window;
>>  	int			sectors;
>>  	unsigned long		state;
>>  	struct mddev		*mddev;
>
>
>Thanks,
>NeilBrown
>?韬{.n?????%??檩??w?{.n???{炳盯w???塄}?财??j:+v??????2??璀??摺?囤??z夸z罐?+?????w棹f