From: Morgan Mears <morgan.mears@xxxxxxxxxx>

This commit includes a non-terminal policy (aka "shim") called era that
may be stacked on top of a terminal policy (e.g. mq).

The era policy adds:
- an era number to every cache block that gets updated on write hits
- an interface that allows an application to read and increment the
  current era value
- an interface to invalidate cache blocks that have been written to
  before or after a given era

This functionality can be used to partially invalidate the cache
contents to restore cache coherency after a snapshot rollback.

Signed-off-by: Morgan Mears <morgan.mears@xxxxxxxxxx>
Signed-off-by: Heinz Mauelshagen <heinzm@xxxxxxxxxx>
Signed-off-by: Mike Snitzer <snitzer@xxxxxxxxxx>
---
 drivers/md/Kconfig               |  17 ++
 drivers/md/Makefile              |   2 +
 drivers/md/dm-cache-policy-era.c | 428 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 447 insertions(+)
 create mode 100644 drivers/md/dm-cache-policy-era.c

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 816e023..ad32101 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -282,6 +282,23 @@ config DM_CACHE_MQ
 	  This is meant to be a general purpose policy.  It prioritises
 	  reads over writes.
 
+config DM_CACHE_ERA
+	tristate "ERA Cache Policy shim (EXPERIMENTAL)"
+	depends on DM_CACHE
+	---help---
+	  A cache policy shim that adds an "era" property to the
+	  per-cache-block metadata, to facilitate the implementation of
+	  cache coherency validation and recovery tools.  This mechanism
+	  works as follows.  There is a monotonically increasing 32-bit
+	  era counter associated with each cache instance.  Each cache
+	  block is tagged with the era during which it was last written.
+	  A device mapper message interface is provided to obtain the
+	  current era, advance to the next era, and invalidate blocks
+	  from before or after a given era.  Note that you can use this
+	  policy shim to add the era functionality to any cache policy
+	  via name concatenation -- specify era+mq instead of just mq to
+	  add the era mechanism to the mq policy, for example.
+
 config DM_CACHE_CLEANER
 	tristate "Cleaner Cache Policy (EXPERIMENTAL)"
 	depends on DM_CACHE
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 5f6dfc3..0ae00bd 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -15,6 +15,7 @@ dm-cache-y	+= dm-cache-target.o dm-cache-metadata.o dm-cache-policy.o \
 		   dm-cache-shim-utils.o dm-cache-stack-utils.o
 dm-cache-mq-y	+= dm-cache-policy-mq.o
 dm-cache-cleaner-y += dm-cache-policy-cleaner.o
+dm-cache-era-y	+= dm-cache-policy-era.o
 md-mod-y	+= md.o bitmap.o
 raid456-y	+= raid5.o
@@ -53,6 +54,7 @@ obj-$(CONFIG_DM_VERITY)	+= dm-verity.o
 obj-$(CONFIG_DM_CACHE)		+= dm-cache.o
 obj-$(CONFIG_DM_CACHE_MQ)	+= dm-cache-mq.o
 obj-$(CONFIG_DM_CACHE_CLEANER)	+= dm-cache-cleaner.o
+obj-$(CONFIG_DM_CACHE_ERA)	+= dm-cache-era.o
 
 ifeq ($(CONFIG_DM_UEVENT),y)
 dm-mod-objs			+= dm-uevent.o
diff --git a/drivers/md/dm-cache-policy-era.c b/drivers/md/dm-cache-policy-era.c
new file mode 100644
index 0000000..427514c
--- /dev/null
+++ b/drivers/md/dm-cache-policy-era.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright 2013 NetApp, Inc. All Rights Reserved, contribution by
+ * Morgan Mears.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ * or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * for more details.
+ *
+ */
+
+#include "dm-cache-policy.h"
+#include "dm-cache-policy-internal.h"
+#include "dm-cache-shim-utils.h"
+#include "dm.h"
+
+#include <linux/hash.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <linux/delay.h>
+
+#define DEBUG_ERA 0
+
+#define DM_MSG_PREFIX "cache-policy-era"
+
+typedef uint32_t era_t;
+#define ERA_MAX_ERA UINT_MAX
+
+struct era_policy {
+	struct dm_cache_policy policy;
+
+	struct mutex lock; /* FIXME: spinlock? */
+
+	dm_cblock_t cache_size;
+
+	era_t *cb_to_era;
+
+	era_t era_counter;
+};
+
+/*----------------------------------------------------------------*/
+
+static struct era_policy *to_era_policy(struct dm_cache_policy *p)
+{
+	return container_of(p, struct era_policy, policy);
+}
+
+static int incr_era_counter(struct era_policy *era, const char *curr_era_str)
+{
+	era_t curr_era_counter;
+	int r;
+
+	/*
+	 * If the era counter value provided by the user matches the current
+	 * counter value while under lock, increment the counter (intention
+	 * is to prevent races).  Rollover problems are avoided by locking
+	 * the counter at a maximum value.  The application must take
+	 * appropriate action on this error to preserve correctness, but
+	 * a properly behaved set of applications will never trigger it;
+	 * the era counter is meant to increment less than once a second
+	 * and is 32 bits.
+	 */
+
+	if (kstrtou32(curr_era_str, 10, &curr_era_counter))
+		return -EINVAL;
+
+	smp_rmb();
+	if (era->era_counter != curr_era_counter)
+		r = -ECANCELED;
+	else if (era->era_counter >= ERA_MAX_ERA)
+		r = -EOVERFLOW;
+	else {
+		era->era_counter++;
+		smp_wmb();
+		r = 0;
+	}
+
+	return r;
+}
+
+static void *era_cblock_to_hint(struct shim_walk_map_ctx *ctx,
+				dm_cblock_t cblock, dm_oblock_t oblock)
+{
+	struct era_policy *era = to_era_policy(ctx->my_policy);
+	era_t era_val;
+	era_val = era->cb_to_era[from_cblock(cblock)];
+#if DEBUG_ERA
+	DMDEBUG("storing era %u for cblock %u.", era_val, cblock);
+#endif
+	ctx->le32_buf = cpu_to_le32(era_val);
+	return &ctx->le32_buf;
+}
+
+static int era_is_gt_value(era_t era, era_t value)
+{
+	return era > value;
+}
+
+static int era_is_gte_value(era_t era, era_t value)
+{
+	return era >= value;
+}
+
+static int era_is_lte_value(era_t era, era_t value)
+{
+	return era <= value;
+}
+
+static int era_is_lt_value(era_t era, era_t value)
+{
+	return era < value;
+}
+
+typedef int (*era_match_fn_t)(era_t, era_t);
+
+struct inval_oblocks_ctx {
+	struct era_policy *era;
+	era_match_fn_t era_match_fn;
+	era_t test_era;
+};
+
+static int era_inval_oblocks(void *context, dm_cblock_t cblock,
+			     dm_oblock_t oblock, void *unused)
+{
+	struct inval_oblocks_ctx *ctx = (struct inval_oblocks_ctx *)context;
+	struct dm_cache_policy *child;
+	era_t act_era;
+
+	act_era = ctx->era->cb_to_era[from_cblock(cblock)];
+	if (ctx->era_match_fn(act_era, ctx->test_era)) {
+#if DEBUG_ERA
+		DMDEBUG("cblock %u has era %u matching test_era %u; "
+			"marking mapping to be removed for oblock %llu.",
+			from_cblock(cblock), act_era, ctx->test_era, oblock);
+#endif
+		child = ctx->era->policy.child;
+
+		/*
+		 * This deadlocks (lock against self) because child is calling
+		 * us via the walk_mappings context callback, child's
+		 * walk_mappings holds child's lock, and child's remove_mappings
+		 * tries to get it again.  Not fixing because I believe the
+		 * invalidate API is going to change.
+		 */
+		/* child->remove_mapping(child, oblock); */
+	}
+
+	return 0;
+}
+
+static int cond_unmap_by_era(struct era_policy *era, const char *test_era_str,
+			     era_match_fn_t era_match_fn)
+{
+	struct shim_walk_map_ctx ctx;
+	struct inval_oblocks_ctx io_ctx;
+	era_t test_era;
+	int r;
+
+	/*
+	 * Unmap blocks with eras matching the given era, according to the
+	 * given matching function.
+	 */
+
+	if (kstrtou32(test_era_str, 10, &test_era))
+		return -EINVAL;
+
+	io_ctx.era = era;
+	io_ctx.era_match_fn = era_match_fn;
+	io_ctx.test_era = test_era;
+
+	ctx.parent_ctx = &io_ctx;
+	ctx.parent_fn = era_inval_oblocks;
+	ctx.my_policy = &era->policy;
+	ctx.child_hint_buf = NULL;
+	ctx.cblock_to_hint_fn = NULL;
+
+	mutex_lock(&era->lock);
+	r = dm_cache_shim_utils_walk_map_with_ctx(&ctx);
+	mutex_unlock(&era->lock);
+
+	return r;
+}
+
+/*
+ * Public interface, via the policy struct.  See dm-cache-policy.h for a
+ * description of these.
+ */
+
+static void era_destroy(struct dm_cache_policy *p)
+{
+	struct era_policy *era = to_era_policy(p);
+#if DEBUG_ERA
+	DMDEBUG("destroyed era %p", era);
+#endif
+	kfree(era->cb_to_era);
+	kfree(era);
+}
+
+static int era_map(struct dm_cache_policy *p, dm_oblock_t oblock,
+		   bool can_block, bool can_migrate, bool discarded_oblock,
+		   struct bio *bio, struct policy_result *result)
+{
+	struct era_policy *era = to_era_policy(p);
+	uint32_t cb_idx;
+	int r;
+
+	result->op = POLICY_MISS;
+
+	if (can_block)
+		mutex_lock(&era->lock);
+	else if (!mutex_trylock(&era->lock))
+		return -EWOULDBLOCK;
+
+	/* Check for a mapping */
+	r = policy_map(p->child, oblock, can_block, can_migrate,
+		       discarded_oblock, bio, result);
+
+	/* If we got a hit and this is a write, update the era for the block */
+	if (!r && (bio_data_dir(bio) == WRITE) && (result->op == POLICY_HIT)) {
+		cb_idx = from_cblock(result->cblock);
+		BUG_ON(cb_idx >= from_cblock(era->cache_size));
+		smp_rmb();
+		era->cb_to_era[cb_idx] = era->era_counter;
+#if DEBUG_ERA
+		DMDEBUG("assigned era %u to cblock %u, oblock %llu due to write hit.",
+			era->era_counter, result->cblock, oblock);
+#endif
+	}
+
+	mutex_unlock(&era->lock);
+
+	return r;
+}
+
+static int era_load_mapping(struct dm_cache_policy *p,
+			    dm_oblock_t oblock, dm_cblock_t cblock,
+			    void *hint, bool hint_valid)
+{
+	struct era_policy *era = to_era_policy(p);
+	struct dm_cache_policy *child;
+	__le32 *le32_hint;
+	era_t recovered_era;
+	int r;
+
+	child = era->policy.child;
+
+	le32_hint = (__le32 *)hint;
+	hint = &le32_hint[1];
+
+	r = policy_load_mapping(child, oblock, cblock, hint, hint_valid);
+
+	if (!r && hint_valid &&
+	    (from_cblock(cblock) < from_cblock(era->cache_size))) {
+		recovered_era = le32_to_cpu(*le32_hint);
+#if DEBUG_ERA
+		DMDEBUG("recovered era %u for cblock %u.", recovered_era, cblock);
+#endif
+		era->cb_to_era[from_cblock(cblock)] = recovered_era;
+
+		/*
+		 * Make sure the era counter starts higher than the highest
+		 * persisted era.
+		 */
+		smp_rmb();
+		if (recovered_era >= era->era_counter) {
+			era->era_counter = recovered_era;
+			if (era->era_counter < ERA_MAX_ERA)
+				era->era_counter++;
+			smp_wmb();
+#if DEBUG_ERA
+			DMDEBUG("set era_counter to %u.", era->era_counter);
+#endif
+		}
+	}
+
+	return r;
+}
+
+static int era_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn,
+			     void *context)
+{
+	return dm_cache_shim_utils_walk_map(p, fn, context, era_cblock_to_hint);
+}
+
+static void era_force_mapping(struct dm_cache_policy *p, dm_oblock_t old_oblock,
+			      dm_oblock_t new_oblock)
+{
+	struct era_policy *era = to_era_policy(p);
+	dm_cblock_t cblock;
+
+	mutex_lock(&era->lock);
+
+	if (!policy_lookup(p->child, old_oblock, &cblock)) {
+		smp_rmb();
+		era->cb_to_era[from_cblock(cblock)] = era->era_counter;
+#if DEBUG_ERA
+		DMDEBUG("assigned era %u to cblock %u, oblock %llu "
+			"(old_oblock %llu) due to force_mapping.",
+			era->era_counter, cblock, new_oblock, old_oblock);
+#endif
+	}
+
+	policy_force_mapping(p->child, old_oblock, new_oblock);
+
+	mutex_unlock(&era->lock);
+}
+
+static int era_set_config_value(struct dm_cache_policy *p, const char *key,
+				const char *value)
+{
+	struct era_policy *era = to_era_policy(p);
+	int r;
+
+	if (!strcasecmp(key, "increment_era_counter"))
+		r = incr_era_counter(era, value);
+	else if (!strcasecmp(key, "unmap_blocks_from_later_eras"))
+		r = cond_unmap_by_era(era, value, era_is_gt_value);
+	else if (!strcasecmp(key, "unmap_blocks_from_this_era_and_later"))
+		r = cond_unmap_by_era(era, value, era_is_gte_value);
+	else if (!strcasecmp(key, "unmap_blocks_from_this_era_and_earlier"))
+		r = cond_unmap_by_era(era, value, era_is_lte_value);
+	else if (!strcasecmp(key, "unmap_blocks_from_earlier_eras"))
+		r = cond_unmap_by_era(era, value, era_is_lt_value);
+	else
+		r = policy_set_config_value(p->child, key, value);
+
+	return r;
+}
+
+static int era_emit_config_values(struct dm_cache_policy *p, char *result,
+				  unsigned maxlen)
+{
+	struct era_policy *era = to_era_policy(p);
+	ssize_t sz = 0;
+
+	smp_rmb();
+	DMEMIT("era_counter %u ", era->era_counter);
+	return policy_emit_config_values(p->child, result + sz, maxlen - sz);
+}
+
+/* Init the policy plugin interface function pointers. */
+static void init_policy_functions(struct era_policy *era)
+{
+	dm_cache_shim_utils_init_shim_policy(&era->policy);
+	era->policy.destroy = era_destroy;
+	era->policy.map = era_map;
+	era->policy.load_mapping = era_load_mapping;
+	era->policy.walk_mappings = era_walk_mappings;
+	era->policy.force_mapping = era_force_mapping;
+	era->policy.emit_config_values = era_emit_config_values;
+	era->policy.set_config_value = era_set_config_value;
+}
+
+static struct dm_cache_policy *era_create(dm_cblock_t cache_size,
+					  sector_t origin_size,
+					  sector_t cache_block_size)
+{
+	struct era_policy *era = kzalloc(sizeof(*era), GFP_KERNEL);
+
+	if (!era)
+		return NULL;
+
+	init_policy_functions(era);
+	era->cache_size = cache_size;
+	mutex_init(&era->lock);
+
+	era->cb_to_era = kzalloc(from_cblock(era->cache_size) *
+				 sizeof(*(era->cb_to_era)), GFP_KERNEL);
+	if (!era->cb_to_era)
+		goto bad_alloc_cb_to_era;
+	era->era_counter = 1;
+
+	return &era->policy;
+
+bad_alloc_cb_to_era:
+	kfree(era);
+	return NULL;
+}
+
+/*----------------------------------------------------------------*/
+
+static struct dm_cache_policy_type era_policy_type = {
+	.name = "era",
+	.version = {1, 0, 0},
+	.hint_size = 4,
+	.owner = THIS_MODULE,
+	.create = era_create,
+	.features = DM_CACHE_POLICY_SHIM
+};
+
+static int __init era_init(void)
+{
+	int r;
+
+	r = dm_cache_policy_register(&era_policy_type);
+	if (!r) {
+		DMINFO("version %u.%u.%u loaded",
+		       era_policy_type.version[0],
+		       era_policy_type.version[1],
+		       era_policy_type.version[2]);
+		return 0;
+	}
+
+	DMERR("register failed %d", r);
+
+	dm_cache_policy_unregister(&era_policy_type);
+	return -ENOMEM;
+}
+
+static void __exit era_exit(void)
+{
+	dm_cache_policy_unregister(&era_policy_type);
+}
+
+module_init(era_init);
+module_exit(era_exit);
+
+MODULE_AUTHOR("Morgan Mears <dm-devel@xxxxxxxxxx>");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("era cache policy shim");
-- 
1.8.1.4

--
dm-devel mailing list
dm-devel@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/dm-devel
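
Editor's note, not part of the patch above: the keys handled by era_set_config_value() (increment_era_counter and the unmap_blocks_from_* family) are described in the Kconfig help as a device mapper message interface, so the sketch below assumes they are reachable through the cache target's ordinary target-message path (the same mechanism "dmsetup message <cache-dev> 0 <key> <value>" uses) and drives it from userspace with libdevmapper. The device name "my-cache" and the era value 7 are placeholders; a real tool would first parse "era_counter <n>" out of the cache device's status output, since incr_era_counter() rejects a stale counter value with -ECANCELED. This is a minimal illustration only, under those assumptions.

/*
 * Hypothetical userspace helper for the era shim's message interface.
 * Build with:  cc era_msg.c -o era_msg -ldevmapper
 */
#include <stdio.h>
#include <libdevmapper.h>

static int send_policy_message(const char *dm_name, const char *msg)
{
	struct dm_task *dmt;
	int r = 0;

	/* A target message ("dmsetup message" equivalent). */
	dmt = dm_task_create(DM_DEVICE_TARGET_MSG);
	if (!dmt)
		return 0;

	/* Address the cache target that begins at sector 0 of the device. */
	if (dm_task_set_name(dmt, dm_name) &&
	    dm_task_set_sector(dmt, 0) &&
	    dm_task_set_message(dmt, msg))
		r = dm_task_run(dmt);

	dm_task_destroy(dmt);
	return r;
}

int main(void)
{
	/*
	 * "my-cache" and era 7 are placeholders; the current era value
	 * would normally be read back from the device's status line and
	 * echoed here so the increment is race-free.
	 */
	if (!send_policy_message("my-cache", "increment_era_counter 7"))
		fprintf(stderr, "failed to advance the era counter\n");

	/*
	 * After rolling the origin back to a snapshot taken before era 7
	 * began, drop every cached mapping written during era 7 or later.
	 */
	if (!send_policy_message("my-cache",
				 "unmap_blocks_from_this_era_and_later 7"))
		fprintf(stderr, "failed to invalidate newer-era blocks\n");

	return 0;
}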