Re: [patch 1/2]MD: raid5 trim support

On Thu, Sep 20, 2012 at 11:15:17AM +1000, NeilBrown wrote:
> On Tue, 18 Sep 2012 16:25:11 +0800 Shaohua Li <shli@xxxxxxxxxx> wrote:
> 
> > Discard for raid4/5/6 has a limitation. If the discard request size is small,
> > we discard one disk but must still calculate parity and write the parity
> > disk. To calculate parity correctly, zero_after_discard must be guaranteed.
> > Even if it's true, we discard one disk but write the other disks, which wears
> > out the parity disks quickly. This doesn't make sense. So an efficient
> > discard for raid4/5/6 should discard all data disks and parity disks, which
> > requires the write pattern to be (A, A+chunk_size, A+chunk_size*2, ...). If
> > A's size is smaller than chunk_size, such a pattern is almost impossible in
> > practice. So in this patch, I only handle the case where A's size equals
> > chunk_size. That is, the discard request must be aligned to the stripe size
> > and its size must be a multiple of the stripe size.
> > 
> > Since we can only handle requests with a specific alignment and size (or the
> > part of a request that fits whole stripes), we can't guarantee
> > zero_after_discard even if zero_after_discard is true in the low-level
> > drives.
> > 
> > The block layer doesn't send down correctly aligned requests even when the
> > correct discard alignment is set, so I must filter them out.
> > 
> > For raid4/5/6 parity calculation, if the data is 0, the parity is 0. So if
> > zero_after_discard is true for all disks, the data is consistent after a
> > discard. Otherwise, data might be lost. Consider this scenario: discard a
> > stripe, then write data to one disk and write the parity disk. The stripe
> > could still be inconsistent at that point, depending on whether data from
> > the other data disks or from the parity disk is used to calculate the new
> > parity. If a disk is broken, we can't restore it. So in this patch, we only
> > enable discard support if all disks have zero_after_discard.
> > 
> > If discard fails on one disk, we face a similar inconsistency issue. The
> > patch makes discard follow the same path as a normal write request. If the
> > discard fails, a resync is scheduled to make the data consistent. The extra
> > writes aren't ideal, but data consistency is important.
> > 
> > If a subsequent read/write request hits the raid5 cache of a discarded
> > stripe, the discarded dev page should be zero-filled so the data is
> > consistent. This patch always zeroes the dev page for a discarded request
> > stripe. This isn't optimal, because a discard request doesn't need such a
> > payload; the next patch will avoid it.
> > 
> > Signed-off-by: Shaohua Li <shli@xxxxxxxxxxxx>
> > ---
> >  drivers/md/raid5.c |  176 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  drivers/md/raid5.h |    1 
> >  2 files changed, 174 insertions(+), 3 deletions(-)
> > 
> > Index: linux/drivers/md/raid5.c
> > ===================================================================
> > --- linux.orig/drivers/md/raid5.c	2012-09-18 16:15:51.219353357 +0800
> > +++ linux/drivers/md/raid5.c	2012-09-18 16:15:55.471299904 +0800
> > @@ -547,6 +547,8 @@ static void ops_run_io(struct stripe_hea
> >  				rw = WRITE_FUA;
> >  			else
> >  				rw = WRITE;
> > +			if (test_and_clear_bit(R5_Discard, &sh->dev[i].flags))
> > +				rw |= REQ_DISCARD;
> >  		} else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
> >  			rw = READ;
> >  		else if (test_and_clear_bit(R5_WantReplace,
> > @@ -1170,8 +1172,13 @@ ops_run_biodrain(struct stripe_head *sh,
> >  					set_bit(R5_WantFUA, &dev->flags);
> >  				if (wbi->bi_rw & REQ_SYNC)
> >  					set_bit(R5_SyncIO, &dev->flags);
> > -				tx = async_copy_data(1, wbi, dev->page,
> > -					dev->sector, tx);
> > +				if (wbi->bi_rw & REQ_DISCARD) {
> > +					memset(page_address(dev->page), 0,
> > +						STRIPE_SECTORS << 9);
> > +					set_bit(R5_Discard, &dev->flags);
> > +				} else
> > +					tx = async_copy_data(1, wbi, dev->page,
> > +						dev->sector, tx);
> >  				wbi = r5_next_bio(wbi, dev->sector);
> >  			}
> >  		}
> > @@ -1237,6 +1244,20 @@ ops_run_reconstruct5(struct stripe_head
> >  	pr_debug("%s: stripe %llu\n", __func__,
> >  		(unsigned long long)sh->sector);
> >  
> > +	for (i = 0; i < sh->disks; i++) {
> > +		if (pd_idx == i)
> > +			continue;
> > +		if (!test_bit(R5_Discard, &sh->dev[i].flags))
> > +			break;
> > +	}
> > +	if (i >= sh->disks) {
> > +		atomic_inc(&sh->count);
> > +		memset(page_address(sh->dev[pd_idx].page), 0,
> > +			STRIPE_SECTORS << 9);
> > +		set_bit(R5_Discard, &sh->dev[pd_idx].flags);
> > +		ops_complete_reconstruct(sh);
> > +		return;
> > +	}
> >  	/* check if prexor is active which means only process blocks
> >  	 * that are part of a read-modify-write (written)
> >  	 */
> > @@ -1281,10 +1302,28 @@ ops_run_reconstruct6(struct stripe_head
> >  {
> >  	struct async_submit_ctl submit;
> >  	struct page **blocks = percpu->scribble;
> > -	int count;
> > +	int count, i;
> >  
> >  	pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
> >  
> > +	for (i = 0; i < sh->disks; i++) {
> > +		if (sh->pd_idx == i || sh->qd_idx == i)
> > +			continue;
> > +		if (!test_bit(R5_Discard, &sh->dev[i].flags))
> > +			break;
> > +	}
> > +	if (i >= sh->disks) {
> > +		atomic_inc(&sh->count);
> > +		memset(page_address(sh->dev[sh->pd_idx].page), 0,
> > +			STRIPE_SECTORS << 9);
> > +		memset(page_address(sh->dev[sh->qd_idx].page), 0,
> > +			STRIPE_SECTORS << 9);
> > +		set_bit(R5_Discard, &sh->dev[sh->pd_idx].flags);
> > +		set_bit(R5_Discard, &sh->dev[sh->qd_idx].flags);
> > +		ops_complete_reconstruct(sh);
> > +		return;
> > +	}
> > +
> >  	count = set_syndrome_sources(blocks, sh);
> >  
> >  	atomic_inc(&sh->count);
> > @@ -4067,6 +4106,96 @@ static void release_stripe_plug(struct m
> >  		release_stripe(sh);
> >  }
> >  
> > +static void make_discard_request(struct mddev *mddev, struct bio *bi)
> > +{
> > +	struct r5conf *conf = mddev->private;
> > +	sector_t logical_sector, last_sector;
> > +	struct stripe_head *sh;
> > +	int remaining;
> > +	int stripe_sectors;
> > +
> > +	if (mddev->reshape_position != MaxSector)
> > +		/* Skip discard while reshape is happening */
> > +		return;
> > +
> > +	logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
> > +	last_sector = bi->bi_sector + (bi->bi_size>>9);
> > +
> > +	bi->bi_next = NULL;
> > +	bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
> > +
> > +	stripe_sectors = conf->chunk_sectors *
> > +		(conf->raid_disks - conf->max_degraded);
> > +	logical_sector = DIV_ROUND_UP_SECTOR_T(logical_sector,
> > +			stripe_sectors);
> > +	sector_div(last_sector, stripe_sectors);
> > +
> > +	logical_sector *= stripe_sectors;
> > +	last_sector *= stripe_sectors;
> > +
> > +	for (;logical_sector < last_sector;
> > +					logical_sector += STRIPE_SECTORS) {
> > +		DEFINE_WAIT(w);
> > +		sector_t new_sector;
> > +		int d;
> > +
> > +		new_sector = raid5_compute_sector(conf, logical_sector,
> > +						  0, &d, NULL);
> 
> This is pointless.  Look at the patch I posted again.  You don't need to call
> raid5_compute_sector().  It essentially just divides logical_sector by
> stripe_sectors.  It is cleaner not to do the multiply in the first place.

In my test, without it, the wrong sectors are trimmed.

