Re: [RFC]raid5: add an option to avoid copy data from bio to stripe cache

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, 28 Apr 2014 14:58:41 +0800 Shaohua Li <shli@xxxxxxxxxx> wrote:

> 
> The stripe cache has two goals:
> 1. cache data, so next time if data can be found in stripe cache, disk access
> can be avoided.
> 2. stable data. Data is copied from the bio to the stripe cache and parity is calculated.
> data written to disk is from stripe cache, so if upper layer changes bio data,
> data written to disk isn't impacted.
> 
> In my environment, I can guarantee 2 will not happen. For 1, it's not common
> either. The block plug mechanism will dispatch a bunch of sequential small requests
> together. And since I'm using an SSD, I'm using a small chunk size. It's a rare case
> where the stripe cache is really useful.
> 
> So I'd like to avoid the copy from bio to stripe cache and it's very helpful
> for performance. In my 1M randwrite tests, avoid the copy can increase the
> performance more than 30%.
> 
> Of course, this shouldn't be enabled by default, so I added an option to
> control it.

I'm happy to avoid copying when we know that we can.

I'm not really happy about using a sysfs attribute to control it.

How do you guarantee that '2' won't happen?

BTW I don't see '1' as important.  The stripe cache is really for gathering
writes together to increase the chance of full-stripe writes, and for
handling synchronisation between IO and resync/reshape/etc. The copying is
primarily for stability.

Thanks,

NeilBrown


> 
> Signed-off-by: Shaohua Li <shli@xxxxxxxxxxxx>
> ---
>  drivers/md/raid5.c |   92 +++++++++++++++++++++++++++++++++++++++++++++--------
>  drivers/md/raid5.h |    4 +-
>  2 files changed, 82 insertions(+), 14 deletions(-)
> 
> Index: linux/drivers/md/raid5.c
> ===================================================================
> --- linux.orig/drivers/md/raid5.c	2014-04-28 14:02:18.025349590 +0800
> +++ linux/drivers/md/raid5.c	2014-04-28 14:02:18.009349792 +0800
> @@ -479,6 +479,7 @@ static void shrink_buffers(struct stripe
>  	int num = sh->raid_conf->pool_size;
>  
>  	for (i = 0; i < num ; i++) {
> +		BUG_ON(sh->dev[i].page != sh->dev[i].orig_page);
>  		p = sh->dev[i].page;
>  		if (!p)
>  			continue;
> @@ -499,6 +500,7 @@ static int grow_buffers(struct stripe_he
>  			return 1;
>  		}
>  		sh->dev[i].page = page;
> +		sh->dev[i].orig_page = page;
>  	}
>  	return 0;
>  }
> @@ -855,6 +857,9 @@ static void ops_run_io(struct stripe_hea
>  			if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
>  				bi->bi_rw |= REQ_NOMERGE;
>  
> +			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
> +				BUG_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
> +			sh->dev[i].vec.bv_page = sh->dev[i].page;
>  			bi->bi_vcnt = 1;
>  			bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
>  			bi->bi_io_vec[0].bv_offset = 0;
> @@ -899,6 +904,9 @@ static void ops_run_io(struct stripe_hea
>  			else
>  				rbi->bi_iter.bi_sector = (sh->sector
>  						  + rrdev->data_offset);
> +			if (test_bit(R5_SkipCopy, &sh->dev[i].flags))
> +				BUG_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
> +			sh->dev[i].rvec.bv_page = sh->dev[i].page;
>  			rbi->bi_vcnt = 1;
>  			rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
>  			rbi->bi_io_vec[0].bv_offset = 0;
> @@ -927,8 +935,9 @@ static void ops_run_io(struct stripe_hea
>  }
>  
>  static struct dma_async_tx_descriptor *
> -async_copy_data(int frombio, struct bio *bio, struct page *page,
> -	sector_t sector, struct dma_async_tx_descriptor *tx)
> +async_copy_data(int frombio, struct bio *bio, struct page **page,
> +	sector_t sector, struct dma_async_tx_descriptor *tx,
> +	struct stripe_head *sh)
>  {
>  	struct bio_vec bvl;
>  	struct bvec_iter iter;
> @@ -965,11 +974,16 @@ async_copy_data(int frombio, struct bio
>  		if (clen > 0) {
>  			b_offset += bvl.bv_offset;
>  			bio_page = bvl.bv_page;
> -			if (frombio)
> -				tx = async_memcpy(page, bio_page, page_offset,
> +			if (frombio) {
> +				if (sh->raid_conf->skip_copy &&
> +				    b_offset == 0 && page_offset == 0 &&
> +				    clen == STRIPE_SIZE)
> +					*page = bio_page;
> +				else
> +					tx = async_memcpy(*page, bio_page, page_offset,
>  						  b_offset, clen, &submit);
> -			else
> -				tx = async_memcpy(bio_page, page, b_offset,
> +			} else
> +				tx = async_memcpy(bio_page, *page, b_offset,
>  						  page_offset, clen, &submit);
>  		}
>  		/* chain the operations */
> @@ -1045,8 +1059,8 @@ static void ops_run_biofill(struct strip
>  			spin_unlock_irq(&sh->stripe_lock);
>  			while (rbi && rbi->bi_iter.bi_sector <
>  				dev->sector + STRIPE_SECTORS) {
> -				tx = async_copy_data(0, rbi, dev->page,
> -					dev->sector, tx);
> +				tx = async_copy_data(0, rbi, &dev->page,
> +					dev->sector, tx, sh);
>  				rbi = r5_next_bio(rbi, dev->sector);
>  			}
>  		}
> @@ -1384,6 +1398,7 @@ ops_run_biodrain(struct stripe_head *sh,
>  			BUG_ON(dev->written);
>  			wbi = dev->written = chosen;
>  			spin_unlock_irq(&sh->stripe_lock);
> +			BUG_ON(dev->page != dev->orig_page);
>  
>  			while (wbi && wbi->bi_iter.bi_sector <
>  				dev->sector + STRIPE_SECTORS) {
> @@ -1393,9 +1408,15 @@ ops_run_biodrain(struct stripe_head *sh,
>  					set_bit(R5_SyncIO, &dev->flags);
>  				if (wbi->bi_rw & REQ_DISCARD)
>  					set_bit(R5_Discard, &dev->flags);
> -				else
> -					tx = async_copy_data(1, wbi, dev->page,
> -						dev->sector, tx);
> +				else {
> +					tx = async_copy_data(1, wbi, &dev->page,
> +						dev->sector, tx, sh);
> +					if (dev->page != dev->orig_page) {
> +						set_bit(R5_SkipCopy, &dev->flags);
> +						clear_bit(R5_UPTODATE, &dev->flags);
> +						clear_bit(R5_OVERWRITE, &dev->flags);
> +					}
> +				}
>  				wbi = r5_next_bio(wbi, dev->sector);
>  			}
>  		}
> @@ -1426,7 +1447,7 @@ static void ops_complete_reconstruct(voi
>  		struct r5dev *dev = &sh->dev[i];
>  
>  		if (dev->written || i == pd_idx || i == qd_idx) {
> -			if (!discard)
> +			if (!discard && !test_bit(R5_SkipCopy, &dev->flags))
>  				set_bit(R5_UPTODATE, &dev->flags);
>  			if (fua)
>  				set_bit(R5_WantFUA, &dev->flags);
> @@ -2750,6 +2771,11 @@ handle_failed_stripe(struct r5conf *conf
>  		/* and fail all 'written' */
>  		bi = sh->dev[i].written;
>  		sh->dev[i].written = NULL;
> +		if (test_and_clear_bit(R5_SkipCopy, &sh->dev[i].flags)) {
> +			BUG_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags));
> +			sh->dev[i].page = sh->dev[i].orig_page;
> +		}
> +
>  		if (bi) bitmap_end = 1;
>  		while (bi && bi->bi_iter.bi_sector <
>  		       sh->dev[i].sector + STRIPE_SECTORS) {
> @@ -2991,12 +3017,17 @@ static void handle_stripe_clean_event(st
>  			dev = &sh->dev[i];
>  			if (!test_bit(R5_LOCKED, &dev->flags) &&
>  			    (test_bit(R5_UPTODATE, &dev->flags) ||
> -			     test_bit(R5_Discard, &dev->flags))) {
> +			     test_bit(R5_Discard, &dev->flags) ||
> +			     test_bit(R5_SkipCopy, &dev->flags))) {
>  				/* We can return any write requests */
>  				struct bio *wbi, *wbi2;
>  				pr_debug("Return write for disc %d\n", i);
>  				if (test_and_clear_bit(R5_Discard, &dev->flags))
>  					clear_bit(R5_UPTODATE, &dev->flags);
> +				if (test_and_clear_bit(R5_SkipCopy, &dev->flags)) {
> +					BUG_ON(test_bit(R5_UPTODATE, &dev->flags));
> +					dev->page = dev->orig_page;
> +				}
>  				wbi = dev->written;
>  				dev->written = NULL;
>  				while (wbi && wbi->bi_iter.bi_sector <
> @@ -3015,6 +3046,8 @@ static void handle_stripe_clean_event(st
>  						0);
>  			} else if (test_bit(R5_Discard, &dev->flags))
>  				discard_pending = 1;
> +			BUG_ON(test_bit(R5_SkipCopy, &dev->flags));
> +			BUG_ON(dev->page != dev->orig_page);
>  		}
>  	if (!discard_pending &&
>  	    test_bit(R5_Discard, &sh->dev[sh->pd_idx].flags)) {
> @@ -5355,6 +5388,38 @@ raid5_preread_bypass_threshold = __ATTR(
>  					raid5_store_preread_threshold);
>  
>  static ssize_t
> +raid5_show_skip_copy(struct mddev *mddev, char *page)
> +{
> +	struct r5conf *conf = mddev->private;
> +	if (conf)
> +		return sprintf(page, "%d\n", conf->skip_copy);
> +	else
> +		return 0;
> +}
> +
> +static ssize_t
> +raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
> +{
> +	struct r5conf *conf = mddev->private;
> +	unsigned long new;
> +	if (len >= PAGE_SIZE)
> +		return -EINVAL;
> +	if (!conf)
> +		return -ENODEV;
> +
> +	if (kstrtoul(page, 10, &new))
> +		return -EINVAL;
> +	conf->skip_copy = new;
> +	return len;
> +}
> +
> +static struct md_sysfs_entry
> +raid5_skip_copy = __ATTR(skip_copy, S_IRUGO | S_IWUSR,
> +					raid5_show_skip_copy,
> +					raid5_store_skip_copy);
> +
> +
> +static ssize_t
>  stripe_cache_active_show(struct mddev *mddev, char *page)
>  {
>  	struct r5conf *conf = mddev->private;
> @@ -5439,6 +5504,7 @@ static struct attribute *raid5_attrs[] =
>  	&raid5_stripecache_active.attr,
>  	&raid5_preread_bypass_threshold.attr,
>  	&raid5_group_thread_cnt.attr,
> +	&raid5_skip_copy.attr,
>  	NULL,
>  };
>  static struct attribute_group raid5_attrs_group = {
> Index: linux/drivers/md/raid5.h
> ===================================================================
> --- linux.orig/drivers/md/raid5.h	2014-04-28 14:02:18.025349590 +0800
> +++ linux/drivers/md/raid5.h	2014-04-28 14:02:18.009349792 +0800
> @@ -232,7 +232,7 @@ struct stripe_head {
>  		 */
>  		struct bio	req, rreq;
>  		struct bio_vec	vec, rvec;
> -		struct page	*page;
> +		struct page	*page, *orig_page;
>  		struct bio	*toread, *read, *towrite, *written;
>  		sector_t	sector;			/* sector of this page */
>  		unsigned long	flags;
> @@ -299,6 +299,7 @@ enum r5dev_flags {
>  			 * data in, and now is a good time to write it out.
>  			 */
>  	R5_Discard,	/* Discard the stripe */
> +	R5_SkipCopy,	/* Don't copy data from bio to stripe cache */
>  };
>  
>  /*
> @@ -436,6 +437,7 @@ struct r5conf {
>  	atomic_t		pending_full_writes; /* full write backlog */
>  	int			bypass_count; /* bypassed prereads */
>  	int			bypass_threshold; /* preread nice */
> +	int			skip_copy; /* Don't copy data from bio to stripe cache */
>  	struct list_head	*last_hold; /* detect hold_list promotions */
>  
>  	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */

Attachment: signature.asc
Description: PGP signature


[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux