[RFC 3/5] r5cache: look up stripe cache for chunk_aligned_read

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



This is the read part of raid5 cache (r5cache).

In raid456, when the array is in-sync, the md layer bypasses stripe
cache for chunk_aligned_read(). However, with write back cache,
data in the RAID disks may not be uptodate. Therefore, it is necessary
to search the stripe cache latest data.

With this patch, raid5_read_one_chunk() looks up data in stripe cache.
The outcome of this lookup could be read_full_hit (all data of the
chunk are in stripe cache), read_partial_hit (only part of the chunk
is in stripe cache), or read_miss (no data of the chunk in stripe cache).

For read_full_hit, raid5_read_one_chunk returns data directly from
stripe cache; for read_miss, raid5_read_one_chunk reads all data from
the disk; for read_partial_hit, raid5_read_one_chunk reads data from
disk, and amends the data with data in stripe cache in endio
(r5c_chunk_aligned_read_endio).

Sysfs entry is added to show statistics of read_full_hits,
read_partial_hits, and read_misses.

Signed-off-by: Song Liu <songliubraving@xxxxxx>
Signed-off-by: Shaohua Li <shli@xxxxxx>
---
 drivers/md/raid5-cache.c | 238 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid5.c       |  23 ++++-
 drivers/md/raid5.h       |   6 ++
 3 files changed, 265 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index e889e2d..5f0d96f 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -40,8 +40,15 @@
  */
 #define R5L_POOL_SIZE	4
 
+struct r5c_cache {
+	atomic64_t read_full_hits;	/* the whole chunk in cache */
+	atomic64_t read_partial_hits;	/* some pages of the chunk in cache */
+	atomic64_t read_misses;		/* the whold chunk is not in cache */
+};
+
 struct r5l_log {
 	struct md_rdev *rdev;
+	struct r5c_cache cache;
 
 	u32 uuid_checksum;
 
@@ -134,6 +141,21 @@ enum r5l_io_unit_state {
 	IO_UNIT_STRIPE_END = 3,	/* stripes data finished writing to raid */
 };
 
+struct r5c_chunk_map {
+	int sh_count;
+	struct r5conf *conf;
+	struct bio *parent_bi;
+	int dd_idx;
+	struct stripe_head *sh_array[0];
+};
+
+static void init_r5c_cache(struct r5conf *conf, struct r5c_cache *cache)
+{
+	atomic64_set(&cache->read_full_hits, 0);
+	atomic64_set(&cache->read_partial_hits, 0);
+	atomic64_set(&cache->read_misses, 0);
+}
+
 static sector_t r5l_ring_add(struct r5l_log *log, sector_t start, sector_t inc)
 {
 	start += inc;
@@ -1120,6 +1142,220 @@ static void r5l_write_super(struct r5l_log *log, sector_t cp)
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 }
 
+/* TODO: use async copy */
+static void r5c_copy_data_to_bvec(struct r5dev *rdev, int sh_offset,
+				  struct bio_vec *bvec, int bvec_offset, int len)
+{
+	/* We always copy data from orig_page. This is because in R-M-W, we use
+	 * page to do prexor of parity */
+	void *src_p = kmap_atomic(rdev->orig_page);
+	void *dst_p = kmap_atomic(bvec->bv_page);
+	memcpy(dst_p + bvec_offset, src_p + sh_offset, len);
+	kunmap_atomic(dst_p);
+	kunmap_atomic(src_p);
+}
+
+/*
+ * copy data from a chunk_map to a bio
+ */
+static void r5c_copy_chunk_map_to_bio(struct r5c_chunk_map *chunk_map,
+			 struct bio *bio)
+{
+	struct bvec_iter iter;
+	struct bio_vec bvec;
+	int sh_idx;
+	unsigned sh_offset;
+
+	sh_idx = 0;
+	sh_offset = (bio->bi_iter.bi_sector & ((sector_t)STRIPE_SECTORS-1)) << 9;
+
+	/*
+	 * If bio is not page aligned, the chunk_map will have 1 more sh than bvecs
+	 * in the bio. Chunk_map may also have NULL-sh. To copy the right data, we
+	 * need to walk through the chunk_map carefully. In this implementation,
+	 * bvec/bvec_offset always matches with sh_array[sh_idx]/sh_offset.
+	 *
+	 * In the following example, the nested loop will run 4 times; and
+	 * r5c_copy_data_to_bvec will be called for the first and last iteration.
+	 *
+	 *             --------------------------------
+	 * chunk_map   | valid sh |  NULL  | valid sh |
+	 *             --------------------------------
+	 *                   ---------------------
+	 * bio               |         |         |
+	 *                   ---------------------
+	 *
+	 *                   |    |    |   |     |
+	 * copy_data         | Y  | N  | N |  Y  |
+	 */
+	bio_for_each_segment(bvec, bio, iter) {
+		int len;
+		unsigned bvec_offset = bvec.bv_offset;
+		while (bvec_offset < PAGE_SIZE) {
+			len = min_t(unsigned, PAGE_SIZE - bvec_offset, PAGE_SIZE - sh_offset);
+			if (chunk_map->sh_array[sh_idx])
+				r5c_copy_data_to_bvec(&chunk_map->sh_array[sh_idx]->dev[chunk_map->dd_idx], sh_offset,
+						      &bvec, bvec_offset, len);
+			bvec_offset += len;
+			sh_offset += len;
+			if (sh_offset == PAGE_SIZE) {
+				sh_idx += 1;
+				sh_offset = 0;
+			}
+		}
+	}
+	return;
+}
+
+/*
+ * release stripes in chunk_map and free the chunk_map
+ */
+static void free_r5c_chunk_map(struct r5c_chunk_map *chunk_map)
+{
+	unsigned sh_idx;
+	struct stripe_head *sh;
+
+	for (sh_idx = 0; sh_idx < chunk_map->sh_count; ++sh_idx) {
+		sh = chunk_map->sh_array[sh_idx];
+		if (sh) {
+			set_bit(STRIPE_HANDLE, &sh->state);
+			raid5_release_stripe(sh);
+		}
+	}
+	kfree(chunk_map);
+}
+
+static void r5c_chunk_aligned_read_endio(struct bio *bio)
+{
+	struct r5c_chunk_map *chunk_map = (struct r5c_chunk_map *) bio->bi_private;
+	struct bio *parent_bi = chunk_map->parent_bi;
+
+	r5c_copy_chunk_map_to_bio(chunk_map, bio);
+	free_r5c_chunk_map(chunk_map);
+	bio_put(bio);
+	bio_endio(parent_bi);
+}
+
+/*
+ * look up bio in stripe cache
+ * return raid_bio	-> no data in cache, read the chunk from disk
+ * return new r5c_bio	-> partial data in cache, read from disk, and amend in r5c_align_endio
+ * return NULL		-> all data in cache, no need to read disk
+ */
+struct bio *r5c_lookup_chunk(struct r5l_log *log, struct bio *raid_bio)
+{
+	struct r5conf *conf;
+	sector_t logical_sector;
+	sector_t first_stripe, last_stripe;  /* first (inclusive) stripe and last (exclusive) */
+	int dd_idx;
+	struct stripe_head *sh;
+	unsigned sh_count, sh_idx, sh_cached;
+	struct r5c_chunk_map *chunk_map;
+	struct bio *r5c_bio;
+	int hash;
+	unsigned long flags;
+
+	if (!log)
+		return raid_bio;
+
+	conf = log->rdev->mddev->private;
+
+	logical_sector = raid_bio->bi_iter.bi_sector &
+		~((sector_t)STRIPE_SECTORS-1);
+	sh_count = DIV_ROUND_UP_SECTOR_T(bio_end_sector(raid_bio) - logical_sector, STRIPE_SECTORS);
+
+	first_stripe = raid5_compute_sector(conf, logical_sector, 0, &dd_idx, NULL);
+	last_stripe = first_stripe + STRIPE_SECTORS * sh_count;
+
+	chunk_map = kzalloc(sizeof(struct r5c_chunk_map) + sh_count * sizeof(struct stripe_head*), GFP_NOIO);
+	sh_cached = 0;
+
+	for (sh_idx = 0; sh_idx < sh_count; ++sh_idx) {
+		hash = stripe_hash_locks_hash(first_stripe + sh_idx * STRIPE_SECTORS);
+		spin_lock_irqsave(conf->hash_locks + hash, flags);
+		sh = __find_stripe(conf, first_stripe + sh_idx * STRIPE_SECTORS, conf->generation);
+		if (sh &&
+		    test_bit(R5_UPTODATE, &sh->dev[dd_idx].flags)) {
+			if (!atomic_inc_not_zero(&sh->count)) {
+				spin_lock(&conf->device_lock);
+				if (!atomic_read(&sh->count)) {
+					if (!test_bit(STRIPE_HANDLE, &sh->state))
+						atomic_inc(&conf->active_stripes);
+					BUG_ON(list_empty(&sh->lru) &&
+					       !test_bit(STRIPE_EXPANDING, &sh->state));
+					list_del_init(&sh->lru);
+					if (sh->group) {
+						sh->group->stripes_cnt--;
+						sh->group = NULL;
+					}
+				}
+				atomic_inc(&sh->count);
+				spin_unlock(&conf->device_lock);
+			}
+			chunk_map->sh_array[sh_idx] = sh;
+			++sh_cached;
+		}
+		spin_unlock_irqrestore(conf->hash_locks + hash, flags);
+	}
+
+	if (sh_cached == 0) {
+		atomic64_inc(&log->cache.read_misses);
+		kfree(chunk_map);
+		return raid_bio;
+	}
+
+	chunk_map->sh_count = sh_count;
+	chunk_map->dd_idx = dd_idx;
+
+	if (sh_cached == sh_count) {
+		atomic64_inc(&log->cache.read_full_hits);
+		r5c_copy_chunk_map_to_bio(chunk_map, raid_bio);
+		free_r5c_chunk_map(chunk_map);
+		bio_endio(raid_bio);
+		return NULL;
+	}
+
+	chunk_map->parent_bi = raid_bio;
+	chunk_map->conf = conf;
+
+	atomic64_inc(&log->cache.read_partial_hits);
+
+	/* TODO: handle bio_clone failure? */
+	r5c_bio = bio_clone_mddev(raid_bio, GFP_NOIO, log->rdev->mddev);
+
+	r5c_bio->bi_private = chunk_map;
+	r5c_bio->bi_end_io = r5c_chunk_aligned_read_endio;
+
+	return r5c_bio;
+}
+
+ssize_t
+r5c_stat_show(struct mddev *mddev, char* page)
+{
+	struct r5conf *conf = mddev->private;
+	struct r5l_log *log;
+	int ret = 0;
+
+	if (!conf)
+		return 0;
+
+	log = conf->log;
+
+	if (!log)
+		return 0;
+
+	ret += snprintf(page + ret, PAGE_SIZE - ret, "r5c_read_full_hits: %llu\n",
+			(unsigned long long) atomic64_read(&log->cache.read_full_hits));
+
+	ret += snprintf(page + ret, PAGE_SIZE - ret, "r5c_read_partial_hits: %llu\n",
+			(unsigned long long) atomic64_read(&log->cache.read_partial_hits));
+
+	ret += snprintf(page + ret, PAGE_SIZE - ret, "r5c_read_misses: %llu\n",
+			(unsigned long long) atomic64_read(&log->cache.read_misses));
+
+	return ret;
+}
+
 static int r5l_load_log(struct r5l_log *log)
 {
 	struct md_rdev *rdev = log->rdev;
@@ -1239,6 +1475,8 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
 	INIT_LIST_HEAD(&log->no_space_stripes);
 	spin_lock_init(&log->no_space_stripes_lock);
 
+	init_r5c_cache(conf, &log->cache);
+
 	if (r5l_load_log(log))
 		goto error;
 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dc24b664..cdd9c4b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -503,7 +503,7 @@ retry:
 	set_bit(STRIPE_BATCH_READY, &sh->state);
 }
 
-static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
+struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
 					 short generation)
 {
 	struct stripe_head *sh;
@@ -515,6 +515,7 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
 	pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
 	return NULL;
 }
+EXPORT_SYMBOL(__find_stripe);
 
 /*
  * Need to check if array has failed when deciding whether to:
@@ -4726,7 +4727,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 {
 	struct r5conf *conf = mddev->private;
 	int dd_idx;
-	struct bio* align_bi;
+	struct bio *align_bi;
+	struct bio *r5c_bio;
 	struct md_rdev *rdev;
 	sector_t end_sector;
 
@@ -4734,6 +4736,18 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 		pr_debug("%s: non aligned\n", __func__);
 		return 0;
 	}
+
+	r5c_bio = r5c_lookup_chunk(conf->log, raid_bio);
+
+	if (r5c_bio == NULL) {
+		pr_debug("Read all data from stripe cache\n");
+		return 1;
+	} else if (r5c_bio == raid_bio)
+		pr_debug("No data in stripe cache, read all from disk\n");
+	else {
+		pr_debug("Partial data in stripe cache, read and amend\n");
+		raid_bio = r5c_bio;
+	}
 	/*
 	 * use bio_clone_mddev to make a copy of the bio
 	 */
@@ -6157,6 +6171,10 @@ raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR,
 				raid5_show_group_thread_cnt,
 				raid5_store_group_thread_cnt);
 
+static struct md_sysfs_entry
+r5c_cache_stats = __ATTR(r5c_cache_stats, S_IRUGO,
+			 r5c_stat_show, NULL);
+
 static struct attribute *raid5_attrs[] =  {
 	&raid5_stripecache_size.attr,
 	&raid5_stripecache_active.attr,
@@ -6164,6 +6182,7 @@ static struct attribute *raid5_attrs[] =  {
 	&raid5_group_thread_cnt.attr,
 	&raid5_skip_copy.attr,
 	&raid5_rmw_level.attr,
+	&r5c_cache_stats.attr,
 	NULL,
 };
 static struct attribute_group raid5_attrs_group = {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 3b68d4f..de11514 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -690,4 +690,10 @@ extern void r5l_stripe_write_finished(struct stripe_head *sh);
 extern int r5l_handle_flush_request(struct r5l_log *log, struct bio *bio);
 extern void r5l_quiesce(struct r5l_log *log, int state);
 extern bool r5l_log_disk_error(struct r5conf *conf);
+
+extern struct bio *r5c_lookup_chunk(struct r5l_log *log, struct bio *raid_bio);
+extern struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector,
+					 short generation);
+
+extern ssize_t r5c_stat_show(struct mddev *mddev, char* page);
 #endif
-- 
2.8.0.rc2

--
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux