Re: [RFC 3/5] r5cache: look up stripe cache for chunk_aligned_read

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, May 27 2016, Song Liu wrote:

> This is the read part of raid5 cache (r5cache).
>
> In raid456, when the array is in-sync, the md layer bypasses stripe
> cache for chunk_aligned_read(). However, with write back cache,
> data in the RAID disks may not be uptodate. Therefore, it is necessary
> to search the stripe cache for the latest data.
>
> With this patch, raid5_read_one_chunk() looks up data in stripe cache.
> The outcome of this lookup could be read_full_hit (all data of the
> chunk are in stripe cache), read_partial_hit (only part of the chunk
> is in stripe cache), or read_miss (no data of the chunk in stripe cache).
>
> For read_full_hit, raid5_read_one_chunk returns data directly from
> stripe cache; for read_miss, raid5_read_one_chunk reads all data from
> the disk; for read_partial_hit, raid5_read_one_chunk reads data from
> disk, and amends the data with data in stripe cache in endio
> (r5c_chunk_aligned_read_endio).

I wonder if all this complexity is really justified.
If we cannot bypass the cache, then we can just perform the read using
the cache with the existing code.  That will be slower than the direct
bypass, but is it so much slower that the complexity is needed?

Alternately.... is a full hit at all likely?  If data is in the stripe
cache, then there is a good chance it is in the page cache too, so it
won't be read.
If we assume that all reads will either be partial-hits or misses, then
your approach will always read from the disk, then add updates (possibly
none) from the cache.  In that case it might be simpler to do exactly
that.
i.e. schedule the read and then when that finishes, find all the stripes
which overlap and if they contain unwritten data, attach the bio and
queue for normal handling.

I suspect the code would end up being much simpler as it would reuse
more existing code.

>
> Sysfs entry is added to show statistics of read_full_hits,
> read_partial_hits, and read_misses.

I'm not sure putting statistics like this in /sys is a good idea.  If
they are just for debugging, then put them in debugfs.

>
> Signed-off-by: Song Liu <songliubraving@xxxxxx>
> Signed-off-by: Shaohua Li <shli@xxxxxx>
> ---
>  drivers/md/raid5-cache.c | 238 +++++++++++++++++++++++++++++++++++++++++++++++
>  drivers/md/raid5.c       |  23 ++++-
>  drivers/md/raid5.h       |   6 ++
>  3 files changed, 265 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index e889e2d..5f0d96f 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -40,8 +40,15 @@
>   */
>  #define R5L_POOL_SIZE	4
>  
> +struct r5c_cache {
> +	atomic64_t read_full_hits;	/* the whole chunk in cache */
> +	atomic64_t read_partial_hits;	/* some pages of the chunk in cache */
> +	atomic64_t read_misses;		/* the whole chunk is not in cache */
> +};

If this structure just holds statistics, then the name of the structure
should indicate that.  "r5c_cache" makes it seem like it is a cache...

> +
>  struct r5l_log {
>  	struct md_rdev *rdev;
> +	struct r5c_cache cache;
>  
>  	u32 uuid_checksum;
>  
> @@ -134,6 +141,21 @@ enum r5l_io_unit_state {
>  	IO_UNIT_STRIPE_END = 3,	/* stripes data finished writing to raid */
>  };
>  
> +struct r5c_chunk_map {
> +	int sh_count;
> +	struct r5conf *conf;
> +	struct bio *parent_bi;
> +	int dd_idx;
> +	struct stripe_head *sh_array[0];
> +};
> +
> +static void init_r5c_cache(struct r5conf *conf, struct r5c_cache *cache)
> +{
> +	atomic64_set(&cache->read_full_hits, 0);
> +	atomic64_set(&cache->read_partial_hits, 0);
> +	atomic64_set(&cache->read_misses, 0);
> +}
> +
>  static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
>  {
>  	start += inc;
> @@ -1120,6 +1142,220 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp)
>  	set_bit(MD_CHANGE_DEVS, &mddev->flags);
>  }
>  
> +/* TODO: use async copy */
> +static void r5c_copy_data_to_bvec(struct r5dev *rdev, int sh_offset,
> +				  struct bio_vec *bvec, int bvec_offset, int len)
> +{
> +	/* We always copy data from orig_page. This is because in R-M-W, we use
> +	 * page to do prexor of parity */
> +	void *src_p = kmap_atomic(rdev->orig_page);
> +	void *dst_p = kmap_atomic(bvec->bv_page);
> +	memcpy(dst_p + bvec_offset, src_p + sh_offset, len);
> +	kunmap_atomic(dst_p);
> +	kunmap_atomic(src_p);
> +}
> +
> +/*
> + * copy data from a chunk_map to a bio
> + */
> +static void r5c_copy_chunk_map_to_bio(struct r5c_chunk_map *chunk_map,
> +			 struct bio *bio)
> +{
> +	struct bvec_iter iter;
> +	struct bio_vec bvec;
> +	int sh_idx;
> +	unsigned sh_offset;
> +
> +	sh_idx = 0;
> +	sh_offset = (bio->bi_iter.bi_sector & ((sector_t)STRIPE_SECTORS-1)) << 9;
> +
> +	/*
> +	 * If bio is not page aligned, the chunk_map will have 1 more sh than bvecs
> +	 * in the bio. Chunk_map may also have NULL-sh. To copy the right data, we
> +	 * need to walk through the chunk_map carefully. In this implementation,
> +	 * bvec/bvec_offset always matches with sh_array[sh_idx]/sh_offset.
> +	 *
> +	 * In the following example, the nested loop will run 4 times; and
> +	 * r5c_copy_data_to_bvec will be called for the first and last iteration.
> +	 *
> +	 *             --------------------------------
> +	 * chunk_map   | valid sh |  NULL  | valid sh |
> +	 *             --------------------------------
> +	 *                   ---------------------
> +	 * bio               |         |         |
> +	 *                   ---------------------
> +	 *
> +	 *                   |    |    |   |     |
> +	 * copy_data         | Y  | N  | N |  Y  |
> +	 */
> +	bio_for_each_segment(bvec, bio, iter) {
> +		int len;
> +		unsigned bvec_offset = bvec.bv_offset;
> +		while (bvec_offset < PAGE_SIZE) {
> +			len = min_t(unsigned, PAGE_SIZE - bvec_offset, PAGE_SIZE - sh_offset);
> +			if (chunk_map->sh_array[sh_idx])
> +				r5c_copy_data_to_bvec(&chunk_map->sh_array[sh_idx]->dev[chunk_map->dd_idx], sh_offset,
> +						      &bvec, bvec_offset, len);
> +			bvec_offset += len;
> +			sh_offset += len;
> +			if (sh_offset == PAGE_SIZE) {
> +				sh_idx += 1;
> +				sh_offset = 0;
> +			}
> +		}
> +	}
> +	return;
> +}
> +
> +/*
> + * release stripes in chunk_map and free the chunk_map
> + */
> +static void free_r5c_chunk_map(struct r5c_chunk_map *chunk_map)
> +{
> +	unsigned sh_idx;
> +	struct stripe_head *sh;
> +
> +	for (sh_idx = 0; sh_idx < chunk_map->sh_count; ++sh_idx) {
> +		sh = chunk_map->sh_array[sh_idx];
> +		if (sh) {
> +			set_bit(STRIPE_HANDLE, &sh->state);
> +			raid5_release_stripe(sh);
> +		}
> +	}
> +	kfree(chunk_map);
> +}
> +
> +static void r5c_chunk_aligned_read_endio(struct bio *bio)
> +{
> +	struct r5c_chunk_map *chunk_map = (struct r5c_chunk_map *) bio->bi_private;
> +	struct bio *parent_bi = chunk_map->parent_bi;
> +
> +	r5c_copy_chunk_map_to_bio(chunk_map, bio);
> +	free_r5c_chunk_map(chunk_map);
> +	bio_put(bio);
> +	bio_endio(parent_bi);
> +}
> +
> +/*
> + * look up bio in stripe cache
> + * return raid_bio	-> no data in cache, read the chunk from disk
> + * return new r5c_bio	-> partial data in cache, read from disk, and amend in r5c_align_endio
> + * return NULL		-> all data in cache, no need to read disk
> + */
> +struct bio *r5c_lookup_chunk(struct r5l_log *log, struct bio *raid_bio)
> +{
> +	struct r5conf *conf;
> +	sector_t logical_sector;
> +	sector_t first_stripe, last_stripe;  /* first (inclusive) stripe and last (exclusive) */
> +	int dd_idx;
> +	struct stripe_head *sh;
> +	unsigned sh_count, sh_idx, sh_cached;
> +	struct r5c_chunk_map *chunk_map;
> +	struct bio *r5c_bio;
> +	int hash;
> +	unsigned long flags;
> +
> +	if (!log)
> +		return raid_bio;
> +
> +	conf = log->rdev->mddev->private;
> +
> +	logical_sector = raid_bio->bi_iter.bi_sector &
> +		~((sector_t)STRIPE_SECTORS-1);
> +	sh_count = DIV_ROUND_UP_SECTOR_T(bio_end_sector(raid_bio) - logical_sector, STRIPE_SECTORS);
> +
> +	first_stripe = raid5_compute_sector(conf, logical_sector, 0, &dd_idx, NULL);
> +	last_stripe = first_stripe + STRIPE_SECTORS * sh_count;
> +
> +	chunk_map = kzalloc(sizeof(struct r5c_chunk_map) + sh_count * sizeof(struct stripe_head*), GFP_NOIO);
> +	sh_cached = 0;
> +
> +	for (sh_idx = 0; sh_idx < sh_count; ++sh_idx) {
> +		hash = stripe_hash_locks_hash(first_stripe + sh_idx * STRIPE_SECTORS);
> +		spin_lock_irqsave(conf->hash_locks + hash, flags);
> +		sh = __find_stripe(conf, first_stripe + sh_idx * STRIPE_SECTORS, conf->generation);
> +		if (sh &&
> +		    test_bit(R5_UPTODATE, &sh->dev[dd_idx].flags)) {
> +			if (!atomic_inc_not_zero(&sh->count)) {
> +				spin_lock(&conf->device_lock);
> +				if (!atomic_read(&sh->count)) {
> +					if (!test_bit(STRIPE_HANDLE, &sh->state))
> +						atomic_inc(&conf->active_stripes);
> +					BUG_ON(list_empty(&sh->lru) &&
> +					       !test_bit(STRIPE_EXPANDING, &sh->state));
> +					list_del_init(&sh->lru);
> +					if (sh->group) {
> +						sh->group->stripes_cnt--;
> +						sh->group = NULL;
> +					}
> +				}
> +				atomic_inc(&sh->count);
> +				spin_unlock(&conf->device_lock);
> +			}
> +			chunk_map->sh_array[sh_idx] = sh;
> +			++sh_cached;
> +		}
> +		spin_unlock_irqrestore(conf->hash_locks + hash, flags);
> +	}
> +
> +	if (sh_cached == 0) {
> +		atomic64_inc(&log->cache.read_misses);
> +		kfree(chunk_map);
> +		return raid_bio;
> +	}
> +
> +	chunk_map->sh_count = sh_count;
> +	chunk_map->dd_idx = dd_idx;
> +
> +	if (sh_cached == sh_count) {
> +		atomic64_inc(&log->cache.read_full_hits);
> +		r5c_copy_chunk_map_to_bio(chunk_map, raid_bio);
> +		free_r5c_chunk_map(chunk_map);
> +		bio_endio(raid_bio);
> +		return NULL;
> +	}
> +
> +	chunk_map->parent_bi = raid_bio;
> +	chunk_map->conf = conf;
> +
> +	atomic64_inc(&log->cache.read_partial_hits);
> +
> +	/* TODO: handle bio_clone failure? */
> +	r5c_bio = bio_clone_mddev(raid_bio, GFP_NOIO, log->rdev->mddev);
> +
> +	r5c_bio->bi_private = chunk_map;
> +	r5c_bio->bi_end_io = r5c_chunk_aligned_read_endio;
> +
> +	return r5c_bio;
> +}
> +
> +ssize_t
> +r5c_stat_show(struct mddev *mddev, char* page)
> +{
> +	struct r5conf *conf = mddev->private;
> +	struct r5l_log *log;
> +	int ret = 0;
> +
> +	if (!conf)
> +		return 0;
> +
> +	log = conf->log;
> +
> +	if (!log)
> +		return 0;
> +
> +	ret += snprintf(page + ret, PAGE_SIZE - ret, "r5c_read_full_hits: %llu\n",
> +			(unsigned long long) atomic64_read(&log->cache.read_full_hits));
> +
> +	ret += snprintf(page + ret, PAGE_SIZE - ret, "r5c_read_partial_hits: %llu\n",
> +			(unsigned long long) atomic64_read(&log->cache.read_partial_hits));
> +
> +	ret += snprintf(page + ret, PAGE_SIZE - ret, "r5c_read_misses: %llu\n",
> +			(unsigned long long) atomic64_read(&log->cache.read_misses));
> +
> +	return ret;
> +}
> +
>  static int r5l_load_log(struct r5l_log *log)
>  {
>  	struct md_rdev *rdev = log->rdev;
> @@ -1239,6 +1475,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
>  	INIT_LIST_HEAD(&log->no_space_stripes);
>  	spin_lock_init(&log->no_space_stripes_lock);
>  
> +	init_r5c_cache(conf, &log->cache);
> +
>  	if (r5l_load_log(log))
>  		goto error;
>  
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index dc24b664..cdd9c4b 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -503,7 +503,7 @@ retry:
>  	set_bit(STRIPE_BATCH_READY, &sh->state);
>  }
>  
> -static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
> +struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
>  					 short generation)
>  {
>  	struct stripe_head *sh;
> @@ -515,6 +515,7 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
>  	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
>  	return NULL;
>  }
> +EXPORT_SYMBOL(__find_stripe);

Why are you exporting this?  raid5-cache is not a separate module.
If you were going to export it, the name would need to be changed to
include "md_".


NeilBrown


>  
>  /*
>   * Need to check if array has failed when deciding whether to:
> @@ -4726,7 +4727,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
>  {
>  	struct r5conf *conf = mddev->private;
>  	int dd_idx;
> -	struct bio* align_bi;
> +	struct bio *align_bi;
> +	struct bio *r5c_bio;
>  	struct md_rdev *rdev;
>  	sector_t end_sector;
>  
> @@ -4734,6 +4736,18 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
>  		pr_debug("%s: non aligned\n", __func__);
>  		return 0;
>  	}
> +
> +	r5c_bio = r5c_lookup_chunk(conf->log, raid_bio);
> +
> +	if (r5c_bio == NULL) {
> +		pr_debug("Read all data from stripe cache\n");
> +		return 1;
> +	} else if (r5c_bio == raid_bio)
> +		pr_debug("No data in stripe cache, read all from disk\n");
> +	else {
> +		pr_debug("Partial data in stripe cache, read and amend\n");
> +		raid_bio = r5c_bio;
> +	}
>  	/*
>  	 * use bio_clone_mddev to make a copy of the bio
>  	 */
> @@ -6157,6 +6171,10 @@ raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
>  				raid5_show_group_thread_cnt,
>  				raid5_store_group_thread_cnt);
>  
> +static struct md_sysfs_entry
> +r5c_cache_stats = __ATTR(r5c_cache_stats, S_IRUGO,
> +			 r5c_stat_show, NULL);
> +
>  static struct attribute *raid5_attrs[] =  {
>  	&raid5_stripecache_size.attr,
>  	&raid5_stripecache_active.attr,
> @@ -6164,6 +6182,7 @@ static struct attribute *raid5_attrs[] =  {
>  	&raid5_group_thread_cnt.attr,
>  	&raid5_skip_copy.attr,
>  	&raid5_rmw_level.attr,
> +	&r5c_cache_stats.attr,
>  	NULL,
>  };
>  static struct attribute_group raid5_attrs_group = {
> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> index 3b68d4f..de11514 100644
> --- a/drivers/md/raid5.h
> +++ b/drivers/md/raid5.h
> @@ -690,4 +690,10 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
>  extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
>  extern void r5l_quiesce(struct r5l_log *log, int state);
>  extern bool r5l_log_disk_error(struct r5conf *conf);
> +
> +extern struct bio *r5c_lookup_chunk(struct r5l_log *log, struct bio *raid_bio);
> +extern struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
> +					 short generation);
> +
> +extern ssize_t r5c_stat_show(struct mddev *mddev, char* page);
>  #endif
> -- 
> 2.8.0.rc2

Attachment: signature.asc
Description: PGP signature


[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux