On Tue, 18 Sep 2012 16:25:11 +0800 Shaohua Li <shli@xxxxxxxxxx> wrote:

> Discard for raid4/5/6 has limitations. If the discard request size is
> small, we do a discard for one disk, but we still need to calculate
> parity and write the parity disk. To calculate parity correctly,
> zero_after_discard must be guaranteed. Even if it is true, we would
> discard one disk but write the other disks, which makes the parity
> disks wear out fast. This doesn't make sense. So an efficient discard
> for raid4/5/6 should discard all data disks and parity disks, which
> requires the write pattern to be (A, A+chunk_size, A+chunk_size*2...).
> If A's size is smaller than chunk_size, such a pattern is almost
> impossible in practice. So in this patch, I only handle the case where
> A's size equals chunk_size. That is, the discard request should be
> aligned to stripe size and its size should be a multiple of stripe
> size.
>
> Since we can only handle requests with specific alignment and size (or
> the part of a request that fits stripes), we can't guarantee
> zero_after_discard even if zero_after_discard is true in the low-level
> drives.
>
> The block layer doesn't send down correctly aligned requests even when
> the correct discard alignment is set, so I must filter them out.
>
> For raid4/5/6 parity calculation, if the data is 0, the parity is 0.
> So if zero_after_discard is true for all disks, the data is consistent
> after discard. Otherwise, data might be lost. Consider a scenario:
> discard a stripe, then write data to one disk and write the parity
> disk. Until then the stripe could still be inconsistent, depending on
> whether data from the other data disks or from the parity disk is used
> to calculate the new parity. If a disk is broken, we can't restore it.
> So in this patch, we only enable discard support if all disks have
> zero_after_discard.
>
> If a discard fails on one disk, we face a similar inconsistency issue.
> The patch makes discard follow the same path as a normal write
> request: if the discard fails, a resync will be scheduled to make the
> data consistent. The extra writes aren't good, but data consistency is
> important.
>
> If a subsequent read/write request hits the raid5 cache of a discarded
> stripe, the discarded dev pages should be zero-filled so that the data
> is consistent. This patch always zeroes the dev pages of a discarded
> stripe. This isn't optimal because a discard request doesn't need such
> a payload; the next patch will avoid it.
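As a side note on the "if the data is 0, the parity is 0" argument above,
here is a minimal stand-alone sketch of why a fully discarded stripe stays
consistent when zero_after_discard holds: the XOR parity of all-zero data
blocks is itself zero. This is user-space C for illustration only, not part
of the patch; the disk count and block size are made-up values.

#include <assert.h>
#include <string.h>

#define DATA_DISKS 3
#define BLOCK_SIZE 4096

int main(void)
{
	unsigned char data[DATA_DISKS][BLOCK_SIZE];
	unsigned char parity[BLOCK_SIZE];
	int d, i;

	/* zero_after_discard: every discarded data block reads back as zeros */
	memset(data, 0, sizeof(data));
	memset(parity, 0, sizeof(parity));

	/* raid5 parity is the XOR of the data blocks: P = D0 ^ D1 ^ D2 */
	for (d = 0; d < DATA_DISKS; d++)
		for (i = 0; i < BLOCK_SIZE; i++)
			parity[i] ^= data[d][i];

	/* a zeroed parity block already matches, so the stripe is
	 * consistent without recalculating or writing parity */
	for (i = 0; i < BLOCK_SIZE; i++)
		assert(parity[i] == 0);
	return 0;
}

The same reasoning extends to raid6, since the q syndrome of all-zero data
blocks is also zero.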
>
> Signed-off-by: Shaohua Li <shli@xxxxxxxxxxxx>
> ---
>  drivers/md/raid5.c |  176 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  drivers/md/raid5.h |    1
>  2 files changed, 174 insertions(+), 3 deletions(-)
>
> Index: linux/drivers/md/raid5.c
> ===================================================================
> --- linux.orig/drivers/md/raid5.c	2012-09-18 16:15:51.219353357 +0800
> +++ linux/drivers/md/raid5.c	2012-09-18 16:15:55.471299904 +0800
> @@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_hea
>  			rw = WRITE_FUA;
>  		else
>  			rw = WRITE;
> +		if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
> +			rw |= REQ_DISCARD;
>  	} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
>  		rw = READ;
>  	else if (test_and_clear_bit(R5_WantReplace,
> @@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh,
>  				set_bit(R5_WantFUA, &dev->flags);
>  			if (wbi->bi_rw & REQ_SYNC)
>  				set_bit(R5_SyncIO, &dev->flags);
> -			tx = async_copy_data(1, wbi, dev->page,
> -					     dev->sector, tx);
> +			if (wbi->bi_rw & REQ_DISCARD) {
> +				memset(page_address(dev->page), 0,
> +				       STRIPE_SECTORS << 9);
> +				set_bit(R5_Discard, &dev->flags);
> +			} else
> +				tx = async_copy_data(1, wbi, dev->page,
> +						     dev->sector, tx);
>  			wbi = r5_next_bio(wbi, dev->sector);
>  		}
>  	}
> @@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head
>  	pr_debug("%s: stripe %llu\n", __func__,
>  		(unsigned long long)sh->sector);
>
> +	for (i = 0; i < sh->disks; i++) {
> +		if (pd_idx == i)
> +			continue;
> +		if (!test_bit(R5_Discard, &sh->dev[i].flags))
> +			break;
> +	}
> +	if (i >= sh->disks) {
> +		atomic_inc(&sh->count);
> +		memset(page_address(sh->dev[pd_idx].page), 0,
> +		       STRIPE_SECTORS << 9);
> +		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
> +		ops_complete_reconstruct(sh);
> +		return;
> +	}
>  	/* check if prexor is active which means only process blocks
>  	 * that are part of a read-modify-write (written)
>  	 */
> @@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head
>  {
>  	struct async_submit_ctl submit;
>  	struct page **blocks = percpu->scribble;
> -	int count;
> +	int count, i;
>
>  	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
>
> +	for (i = 0; i < sh->disks; i++) {
> +		if (sh->pd_idx == i || sh->qd_idx == i)
> +			continue;
> +		if (!test_bit(R5_Discard, &sh->dev[i].flags))
> +			break;
> +	}
> +	if (i >= sh->disks) {
> +		atomic_inc(&sh->count);
> +		memset(page_address(sh->dev[sh->pd_idx].page), 0,
> +		       STRIPE_SECTORS << 9);
> +		memset(page_address(sh->dev[sh->qd_idx].page), 0,
> +		       STRIPE_SECTORS << 9);
> +		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
> +		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
> +		ops_complete_reconstruct(sh);
> +		return;
> +	}
> +
>  	count = set_syndrome_sources(blocks, sh);
>
>  	atomic_inc(&sh->count);
> @@ -4067,6 +4106,96 @@ static void release_stripe_plug(struct m
>  	release_stripe(sh);
>  }
>
> +static void make_discard_request(struct mddev *mddev, struct bio *bi)
> +{
> +	struct r5conf *conf = mddev->private;
> +	sector_t logical_sector, last_sector;
> +	struct stripe_head *sh;
> +	int remaining;
> +	int stripe_sectors;
> +
> +	if (mddev->reshape_position != MaxSector)
> +		/* Skip discard while reshape is happening */
> +		return;
> +
> +	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
> +	last_sector = bi->bi_sector + (bi->bi_size>>9);
> +
> +	bi->bi_next = NULL;
> +	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
> +
> +	stripe_sectors = conf->chunk_sectors *
> +		(conf->raid_disks - conf->max_degraded);
> +	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
> +					       stripe_sectors);
> +	sector_div(last_sector, stripe_sectors);
> +
> +	logical_sector *= stripe_sectors;
> +	last_sector *= stripe_sectors;
> +
> +	for (;logical_sector < last_sector;
> +	     logical_sector += STRIPE_SECTORS) {
> +		DEFINE_WAIT(w);
> +		sector_t new_sector;
> +		int d;
> +
> +		new_sector = raid5_compute_sector(conf, logical_sector,
> +						  0, &d, NULL);

This is pointless.  Look at the patch I posted again.  You don't need to
call raid5_compute_sector().  It essentially just divides logical_sector
by stripe_sectors.  It is cleaner not to do the multiply in the first
place.

NeilBrown

> +again:
> +		sh = get_active_stripe(conf, new_sector, 0, 0, 0);
> +		prepare_to_wait(&conf->wait_for_overlap, &w,
> +				TASK_UNINTERRUPTIBLE);
> +		spin_lock_irq(&sh->stripe_lock);
> +		for (d = 0; d < conf->raid_disks; d++) {
> +			if (d == sh->pd_idx || d == sh->qd_idx)
> +				continue;
> +			if (sh->dev[d].towrite || sh->dev[d].toread) {
> +				set_bit(R5_Overlap, &sh->dev[d].flags);
> +				spin_unlock_irq(&sh->stripe_lock);
> +				release_stripe(sh);
> +				schedule();
> +				goto again;
> +			}
> +		}
> +		finish_wait(&conf->wait_for_overlap, &w);
> +		for (d = 0; d < conf->raid_disks; d++) {
> +			if (d == sh->pd_idx || d == sh->qd_idx)
> +				continue;
> +			sh->dev[d].towrite = bi;
> +			set_bit(R5_OVERWRITE, &sh->dev[d].flags);
> +			raid5_inc_bi_active_stripes(bi);
> +		}
> +		spin_unlock_irq(&sh->stripe_lock);
> +		if (conf->mddev->bitmap) {
> +			for (d = 0; d < conf->raid_disks - conf->max_degraded;
> +			     d++)
> +				bitmap_startwrite(mddev->bitmap,
> +						  sh->sector,
> +						  STRIPE_SECTORS,
> +						  0);
> +			sh->bm_seq = conf->seq_flush + 1;
> +			set_bit(STRIPE_BIT_DELAY, &sh->state);
> +		}
> +
> +		set_bit(STRIPE_HANDLE, &sh->state);
> +		clear_bit(STRIPE_DELAYED, &sh->state);
> +		if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
> +			atomic_inc(&conf->preread_active_stripes);
> +		release_stripe_plug(mddev, sh);
> +
> +		new_sector = logical_sector + STRIPE_SECTORS;
> +		if (!sector_div(new_sector, conf->chunk_sectors))
> +			logical_sector += conf->chunk_sectors *
> +				(conf->raid_disks - conf->max_degraded - 1);
> +	}
> +
> +	remaining = raid5_dec_bi_active_stripes(bi);
> +	if (remaining == 0) {
> +		md_write_end(mddev);
> +		bio_endio(bi, 0);
> +	}
> +}
> +
>  static void make_request(struct mddev *mddev, struct bio * bi)
>  {
>  	struct r5conf *conf = mddev->private;
> @@ -4089,6 +4218,11 @@ static void make_request(struct mddev *m
>  	     chunk_aligned_read(mddev,bi))
>  		return;
>
> +	if (unlikely(bi->bi_rw & REQ_DISCARD)) {
> +		make_discard_request(mddev, bi);
> +		return;
> +	}
> +
>  	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
>  	last_sector = bi->bi_sector + (bi->bi_size>>9);
>  	bi->bi_next = NULL;
> @@ -5361,6 +5495,7 @@ static int run(struct mddev *mddev)
>
>  	if (mddev->queue) {
>  		int chunk_size;
> +		bool discard_supported = true;
>  		/* read-ahead size must cover two whole stripes, which
>  		 * is 2 * (datadisks) * chunksize where 'n' is the
>  		 * number of raid devices
> @@ -5380,13 +5515,48 @@ static int run(struct mddev *mddev)
>  		blk_queue_io_min(mddev->queue, chunk_size);
>  		blk_queue_io_opt(mddev->queue, chunk_size *
>  				 (conf->raid_disks - conf->max_degraded));
> +		/*
> +		 * We can only discard a whole stripe. It doesn't make sense to
> +		 * discard data disk but write parity disk
> +		 */
> +		stripe = stripe * PAGE_SIZE;
> +		mddev->queue->limits.discard_alignment = stripe;
> +		mddev->queue->limits.discard_granularity = stripe;
> +		/*
> +		 * unaligned part of discard request will be ignored, so can't
> +		 * guarantee discard_zeroes_data
> +		 */
> +		mddev->queue->limits.discard_zeroes_data = 0;
>
>  		rdev_for_each(rdev, mddev) {
>  			disk_stack_limits(mddev->gendisk, rdev->bdev,
>  					  rdev->data_offset << 9);
>  			disk_stack_limits(mddev->gendisk, rdev->bdev,
>  					  rdev->new_data_offset << 9);
> +			/*
> +			 * discard_zeroes_data is required, otherwise data
> +			 * could be lost. Consider a scenario: discard a stripe
> +			 * (the stripe could be inconsistent if
> +			 * discard_zeroes_data is 0); write one disk of the
> +			 * stripe (the stripe could be inconsistent again
> +			 * depending on which disks are used to calculate
> +			 * parity); the disk is broken; the stripe data of this
> +			 * disk is lost.
> +			 */
> +			if (!blk_queue_discard(bdev_get_queue(rdev->bdev)) ||
> +			    !bdev_get_queue(rdev->bdev)->
> +				limits.discard_zeroes_data)
> +				discard_supported = false;
>  		}
> +
> +		if (discard_supported &&
> +		    mddev->queue->limits.max_discard_sectors >= stripe &&
> +		    mddev->queue->limits.discard_granularity >= stripe)
> +			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
> +						mddev->queue);
> +		else
> +			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
> +						  mddev->queue);
>  	}
>
>  	return 0;
> Index: linux/drivers/md/raid5.h
> ===================================================================
> --- linux.orig/drivers/md/raid5.h	2012-09-18 16:15:51.235353157 +0800
> +++ linux/drivers/md/raid5.h	2012-09-18 16:15:55.471299904 +0800
> @@ -298,6 +298,7 @@ enum r5dev_flags {
>  	R5_WantReplace, /* We need to update the replacement, we have read
>  			 * data in, and now is a good time to write it out.
>  			 */
> +	R5_Discard,	/* Discard the stripe */
>  };
>
>  /*
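To make the discard-range arithmetic concrete, here is a stand-alone
user-space sketch of the rounding done in make_discard_request() above. It
also illustrates Neil's point: after rounding, the stripe number is already
in hand, so multiplying it back by stripe_sectors only for
raid5_compute_sector() to divide it out again is a redundant step. The
chunk size, disk counts and bio geometry are made-up illustration values,
not anything taken from the patch.

#include <stdio.h>

typedef unsigned long long sector_t;

int main(void)
{
	sector_t chunk_sectors = 128;            /* 64K chunks */
	int raid_disks = 4, max_degraded = 1;    /* 3+1 raid5 */
	sector_t stripe_sectors =
		chunk_sectors * (raid_disks - max_degraded);

	/* a discard bio covering sectors [500, 2500) */
	sector_t bi_sector = 500, last_sector = 2500;

	/* round the start up and the end down to full-stripe boundaries;
	 * the unaligned head and tail are simply not discarded */
	sector_t first_stripe = (bi_sector + stripe_sectors - 1) /
				stripe_sectors;
	sector_t last_stripe = last_sector / stripe_sectors;

	/* first_stripe/last_stripe are already stripe numbers; multiplying
	 * them by stripe_sectors just so the sector can be divided back
	 * down is the step Neil calls pointless */
	printf("discarding stripes %llu..%llu, sectors [%llu, %llu)\n",
	       first_stripe, last_stripe - 1,
	       first_stripe * stripe_sectors,
	       last_stripe * stripe_sectors);
	return 0;
}

With these values the sketch prints stripes 2..5, i.e. sectors [768, 2304):
the partial stripes at both ends of the bio are skipped, which is also why
the patch sets discard_zeroes_data = 0 on the array's queue.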