Re: [PATCH 4/4] raid1: Rewrite the implementation of iobarrier.

NeilBrown <neilb@xxxxxxx> · Thu, 31 Oct 2013 13:33:14 +1100

On Tue, 29 Oct 2013 09:30:14 +0800 majianpeng <majianpeng@xxxxxxxxx> wrote:

Nearly there!!  Just a few more details.  See below.

> Signed-off-by: Jianpeng Ma <majianpeng@xxxxxxxxx>
> ---
>  drivers/md/raid1.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++------
>  drivers/md/raid1.h |  14 ++++++
>  2 files changed, 124 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index b4a6dcd..5b311c0 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -66,7 +66,8 @@
>   */
>  static int max_queued_requests = 1024;
>  
> -static void allow_barrier(struct r1conf *conf);
> +static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
> +				sector_t bi_sector);
>  static void lower_barrier(struct r1conf *conf);
>  
>  static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
> @@ -227,6 +228,8 @@ static void call_bio_endio(struct r1bio *r1_bio)
>  	struct bio *bio = r1_bio->master_bio;
>  	int done;
>  	struct r1conf *conf = r1_bio->mddev->private;
> +	sector_t start_next_window = r1_bio->start_next_window;
> +	sector_t bi_sector = bio->bi_sector;

This should  be r1_bio->sector, not bio->bi_sector.
They are often the same but if multiple r1_bios are needed for some reason
(e.g. bad blocks) they may not be.

>  
>  	if (bio->bi_phys_segments) {
>  		unsigned long flags;
> @@ -234,6 +237,11 @@ static void call_bio_endio(struct r1bio *r1_bio)
>  		bio->bi_phys_segments--;
>  		done = (bio->bi_phys_segments == 0);
>  		spin_unlock_irqrestore(&conf->device_lock, flags);
> +		/*
> +		 * make_request() might be waiting for
> +		 * bi_phys_segments to decrease
> +		 */
> +		wake_up(&conf->wait_barrier);
>  	} else
>  		done = 1;
>  
> @@ -245,7 +253,7 @@ static void call_bio_endio(struct r1bio *r1_bio)
>  		 * Wake up any possible resync thread that waits for the device
>  		 * to go idle.
>  		 */
> -		allow_barrier(conf);
> +		allow_barrier(conf, start_next_window, bi_sector);
>  	}
>  }
>  
> @@ -827,10 +835,18 @@ static void raise_barrier(struct r1conf *conf)
>  	/* block any new IO from starting */
>  	conf->barrier++;
>  
> -	/* Now wait for all pending IO to complete */
> +	/* For those conditions we must wait:
> +	 * A:while the array is in frozen state
> +	 * B:while barrier >= RESYNC_DEPTH, meaning resync reach
> +	 *   the max count which allowed.
> +	 * C:next_resync + RESYNC_SECTORS > start_next_window, meaning
> +	 *   next resync will reach to window which normal bios are handling.
> +	 */
>  	wait_event_lock_irq(conf->wait_barrier,
>  			    !conf->array_frozen &&
> -			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
> +			    conf->barrier < RESYNC_DEPTH &&
> +			    (conf->start_next_window >=
> +			     conf->next_resync + RESYNC_SECTORS),
>  			    conf->resync_lock);

You've removed the test on conf->nr_pending here, which I think is correct.
It counts 'read' requests as well.  Testing start_next_window serves the same
purpose as it is increased whenever current_window_requests reaches zero.

However you having modified as similar test on nr_pending in wait_barrier().
That worries me a bit.  Should that be changed to a test on start_next_window
to match the above change?

>  
>  	spin_unlock_irq(&conf->resync_lock);
> @@ -846,10 +862,33 @@ static void lower_barrier(struct r1conf *conf)
>  	wake_up(&conf->wait_barrier);
>  }
>  
> -static void wait_barrier(struct r1conf *conf)
> +static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio)
> +{
> +	bool wait = false;
> +
> +	if (conf->array_frozen || !bio)
> +		wait = true;
> +	else if (conf->barrier && bio_data_dir(bio) == WRITE) {
> +		if (conf->next_resync < RESYNC_WINDOW_SECTORS)
> +			wait = true;
> +		else if ((conf->next_resync - RESYNC_WINDOW_SECTORS
> +				>= bio_end_sector(bio)) ||
> +			 (conf->next_resync + NEXT_NORMALIO_DISTANCE
> +				<= bio->bi_sector))
> +			wait = false;
> +		else
> +			wait = true;
> +	}
> +
> +	return wait;
> +}
> +
> +static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
>  {
> +	sector_t sector = 0;
> +
>  	spin_lock_irq(&conf->resync_lock);
> -	if (conf->barrier) {
> +	if (need_to_wait_for_sync(conf, bio)) {
>  		conf->nr_waiting++;
>  		/* Wait for the barrier to drop.
>  		 * However if there are already pending
> @@ -868,16 +907,57 @@ static void wait_barrier(struct r1conf *conf)
>  				     !bio_list_empty(current->bio_list))),
>  				    conf->resync_lock);
>  		conf->nr_waiting--;
> +	} else if (bio_data_dir(bio) == WRITE) {
> +		if (conf->next_resync + NEXT_NORMALIO_DISTANCE
> +				<= bio->bi_sector) {
> +			if (conf->start_next_window == MaxSector)
> +				conf->start_next_window =
> +					conf->next_resync +
> +						NEXT_NORMALIO_DISTANCE;
> +
> +			if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE)
> +				<= bio->bi_sector)
> +				conf->next_window_requests++;
> +			else
> +				conf->current_window_requests++;
> +		}
> +		if (bio->bi_sector >= conf->start_next_window)
> +			sector = conf->start_next_window;

You aren't setting 'sector' if we needed to wait.  I don't think that is
correct, is it?

>  	}
> +
>  	conf->nr_pending++;
>  	spin_unlock_irq(&conf->resync_lock);
> +	return sector;
>  }
>  
> -static void allow_barrier(struct r1conf *conf)
> +static void allow_barrier(struct r1conf *conf, sector_t start_next_window,
> +				sector_t bi_sector)
>  {
>  	unsigned long flags;
> +
>  	spin_lock_irqsave(&conf->resync_lock, flags);
>  	conf->nr_pending--;
> +	if (start_next_window) {
> +		if (start_next_window == conf->start_next_window) {
> +			if (conf->start_next_window + NEXT_NORMALIO_DISTANCE
> +				<= bi_sector)
> +				conf->next_window_requests--;
> +			else
> +				conf->current_window_requests--;
> +		} else
> +			conf->current_window_requests--;
> +
> +		if (!conf->current_window_requests) {
> +			if (conf->next_window_requests) {
> +				conf->current_window_requests =
> +					conf->next_window_requests;
> +				conf->next_window_requests = 0;
> +				conf->start_next_window +=
> +					NEXT_NORMALIO_DISTANCE;
> +			} else
> +				conf->start_next_window = MaxSector;
> +		}
> +	}
>  	spin_unlock_irqrestore(&conf->resync_lock, flags);
>  	wake_up(&conf->wait_barrier);
>  }
> @@ -1012,6 +1092,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
>  	int first_clone;
>  	int sectors_handled;
>  	int max_sectors;
> +	sector_t start_next_window;
>  
>  	/*
>  	 * Register the new request and wait if the reconstruction
> @@ -1041,7 +1122,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
>  		finish_wait(&conf->wait_barrier, &w);
>  	}
>  
> -	wait_barrier(conf);
> +	start_next_window = wait_barrier(conf, bio);
>  
>  	bitmap = mddev->bitmap;
>  
> @@ -1162,6 +1243,7 @@ read_again:
>  
>  	disks = conf->raid_disks * 2;
>   retry_write:
> +	r1_bio->start_next_window = start_next_window;
>  	blocked_rdev = NULL;
>  	rcu_read_lock();
>  	max_sectors = r1_bio->sectors;
> @@ -1230,14 +1312,24 @@ read_again:
>  	if (unlikely(blocked_rdev)) {
>  		/* Wait for this device to become unblocked */
>  		int j;
> +		sector_t old = start_next_window;
>  
>  		for (j = 0; j < i; j++)
>  			if (r1_bio->bios[j])
>  				rdev_dec_pending(conf->mirrors[j].rdev, mddev);
>  		r1_bio->state = 0;
> -		allow_barrier(conf);
> +		allow_barrier(conf, start_next_window, bio->bi_sector);

I think this should be r1_bio->sector, not bio->bi_sector, for the same
reason as earlier.

>  		md_wait_for_blocked_rdev(blocked_rdev, mddev);
> -		wait_barrier(conf);
> +		start_next_window = wait_barrier(conf, bio);
> +		/*
> +		 * We must make sure the multi r1bios of bio have
> +		 * the same value of bi_phys_segments
> +		 */
> +		if (bio->bi_phys_segments && old &&
> +			old != start_next_window)
> +			/*wait the former r1bio(s) completed*/
> +			wait_event(conf->wait_barrier,
> +					bio->bi_phys_segments == 1);
>  		goto retry_write;
>  	}
>  
> @@ -1437,11 +1529,14 @@ static void print_conf(struct r1conf *conf)
>  
>  static void close_sync(struct r1conf *conf)
>  {
> -	wait_barrier(conf);
> -	allow_barrier(conf);
> +	wait_barrier(conf, NULL);
> +	allow_barrier(conf, 0, 0);
>  
>  	mempool_destroy(conf->r1buf_pool);
>  	conf->r1buf_pool = NULL;
> +
> +	conf->next_resync = 0;
> +	conf->start_next_window = MaxSector;
>  }
>  
>  static int raid1_spare_active(struct mddev *mddev)
> @@ -2713,6 +2808,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
>  	conf->pending_count = 0;
>  	conf->recovery_disabled = mddev->recovery_disabled - 1;
>  
> +	conf->start_next_window = MaxSector;
> +	conf->current_window_requests = conf->next_window_requests = 0;
> +
>  	err = -EIO;
>  	for (i = 0; i < conf->raid_disks * 2; i++) {
>  
> diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
> index 331a98a..07425a1 100644
> --- a/drivers/md/raid1.h
> +++ b/drivers/md/raid1.h
> @@ -41,6 +41,19 @@ struct r1conf {
>  	 */
>  	sector_t		next_resync;
>  
> +	/*when raid1 start resync,we divide raid into four partitions
> +	 * |---------|--------------|---------------------|-------------|
> +	 *        next_resync   start_next_window        Pc
> +	 * Now start_next_window = next_resync + NEXT_NORMALIO_DISTANCE
> +	 * Pc = start_next_window + NEXT_NORMALIO_DISTANCE
> +	 * current_window_requests means the count of normalIO between
> +	 * start_next_window and Pc.
> +	 * next_window_requests means the count of nornalIO after Pc.
> +	 * */
> +	sector_t		start_next_window;
> +	int			current_window_requests;
> +	int			next_window_requests;
> +
>  	spinlock_t		device_lock;
>  
>  	/* list of 'struct r1bio' that need to be processed by raid1d,
> @@ -112,6 +125,7 @@ struct r1bio {
>  						 * in this BehindIO request
>  						 */
>  	sector_t		sector;
> +	sector_t		start_next_window;
>  	int			sectors;
>  	unsigned long		state;
>  	struct mddev		*mddev;

Thanks,
NeilBrown
Attachment:
signature.asc

Description: PGP signature